]> git.donarmstrong.com Git - mothur.git/blob - shhhercommand.cpp
fixed bug in sffinfo when ~ was used in the sff filename. fixed issue in shhh.flows...
[mothur.git] / shhhercommand.cpp
1 /*
2  *  shhher.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/27/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "shhhercommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> ShhherCommand::setParameters(){  
14         try {
15                 CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pflow);
16                 CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pfile);
17                 CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(plookup);
18                 CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(pcutoff);
19                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
20                 CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "",false,false); parameters.push_back(pmaxiter);
21         CommandParameter plarge("large", "Number", "", "-1", "", "", "",false,false); parameters.push_back(plarge);
22                 CommandParameter psigma("sigma", "Number", "", "60", "", "", "",false,false); parameters.push_back(psigma);
23                 CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "",false,false); parameters.push_back(pmindelta);
24                 CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder);
25                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
26                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
27                 
28                 vector<string> myArray;
29                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
30                 return myArray;
31         }
32         catch(exception& e) {
33                 m->errorOut(e, "ShhherCommand", "setParameters");
34                 exit(1);
35         }
36 }
37 //**********************************************************************************************************************
38 string ShhherCommand::getHelpString(){  
39         try {
40                 string helpString = "";
41                 helpString += "The shhh.flows command reads a file containing flowgrams and creates a file of corrected sequences.\n";
42                 return helpString;
43         }
44         catch(exception& e) {
45                 m->errorOut(e, "ShhherCommand", "getHelpString");
46                 exit(1);
47         }
48 }
49 //**********************************************************************************************************************
50
51 ShhherCommand::ShhherCommand(){ 
52         try {
53                 abort = true; calledHelp = true;
54                 setParameters();
55                 
56                 //initialize outputTypes
57 //              vector<string> tempOutNames;
58 //              outputTypes["pn.dist"] = tempOutNames;
59
60         }
61         catch(exception& e) {
62                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
63                 exit(1);
64         }
65 }
66
67 //**********************************************************************************************************************
68
69 ShhherCommand::ShhherCommand(string option) {
70         try {
71         
72 #ifdef USE_MPI
73                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
74                 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
75         
76                 if(pid == 0){
77 #endif
78                 abort = false; calledHelp = false;   
79                 
80                 //allow user to run help
81                 if(option == "help") { help(); abort = true; calledHelp = true; }
82                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
83                 
84                 else {
85                         vector<string> myArray = setParameters();
86                         
87                         OptionParser parser(option);
88                         map<string,string> parameters = parser.getParameters();
89                         
90                         ValidParameters validParameter;
91                         map<string,string>::iterator it;
92                         
93                         //check to make sure all parameters are valid for command
94                         for (it = parameters.begin(); it != parameters.end(); it++) { 
95                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
96                         }
97                         
98                         //initialize outputTypes
99                         vector<string> tempOutNames;
100 //                      outputTypes["pn.dist"] = tempOutNames;
101                         //                      outputTypes["fasta"] = tempOutNames;
102                         
103                         //if the user changes the input directory command factory will send this info to us in the output parameter 
104                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
105                         if (inputDir == "not found"){   inputDir = "";          }
106                         else {
107                                 string path;
108                                 it = parameters.find("flow");
109                                 //user has given a template file
110                                 if(it != parameters.end()){ 
111                                         path = m->hasPath(it->second);
112                                         //if the user has not given a path then, add inputdir. else leave path alone.
113                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
114                                 }
115                                 
116                                 it = parameters.find("lookup");
117                                 //user has given a template file
118                                 if(it != parameters.end()){ 
119                                         path = m->hasPath(it->second);
120                                         //if the user has not given a path then, add inputdir. else leave path alone.
121                                         if (path == "") {       parameters["lookup"] = inputDir + it->second;           }
122                                 }
123
124                                 it = parameters.find("file");
125                                 //user has given a template file
126                                 if(it != parameters.end()){ 
127                                         path = m->hasPath(it->second);
128                                         //if the user has not given a path then, add inputdir. else leave path alone.
129                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
130                                 }
131                         }
132             
133             //if the user changes the output directory command factory will send this info to us in the output parameter 
134                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
135             
136                         //check for required parameters
137                         flowFileName = validParameter.validFile(parameters, "flow", true);
138                         flowFilesFileName = validParameter.validFile(parameters, "file", true);
139                         if (flowFileName == "not found" && flowFilesFileName == "not found") {
140                                 m->mothurOut("values for either flow or file must be provided for the shhh.flows command.");
141                                 m->mothurOutEndLine();
142                                 abort = true; 
143                         }
144                         else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
145                         
146                         if(flowFileName != "not found"){
147                                 compositeFASTAFileName = "";    
148                                 compositeNamesFileName = "";    
149                         }
150                         else{
151                                 ofstream temp;
152                 
153                 string thisoutputDir = outputDir;
154                 if (outputDir == "") {  thisoutputDir =  m->hasPath(flowFilesFileName); } //if user entered a file with a path then preserve it
155                 
156                                 //we want to rip off .files, and also .flow if its there
157                 string fileroot = m->getRootName(m->getSimpleName(flowFilesFileName));
158                 if (fileroot[fileroot.length()-1] == '.') {  fileroot = fileroot.substr(0, fileroot.length()-1); } //rip off dot
159                 string extension = m->getExtension(fileroot);
160                 if (extension == ".flow") { fileroot = m->getRootName(fileroot);  }
161                 else { fileroot += "."; } //add back if needed
162                 
163                                 compositeFASTAFileName = thisoutputDir + fileroot + "shhh.fasta";
164                                 m->openOutputFile(compositeFASTAFileName, temp);
165                                 temp.close();
166                                 
167                                 compositeNamesFileName = thisoutputDir + fileroot + "shhh.names";
168                                 m->openOutputFile(compositeNamesFileName, temp);
169                                 temp.close();
170                         }
171             
172             if(flowFilesFileName != "not found"){
173                 string fName;
174                 
175                 ifstream flowFilesFile;
176                 m->openInputFile(flowFilesFileName, flowFilesFile);
177                 while(flowFilesFile){
178                     fName = m->getline(flowFilesFile);
179                     
180                     //test if file is valid
181                     ifstream in;
182                     int ableToOpen = m->openInputFile(fName, in, "noerror");
183                     in.close(); 
184                     if (ableToOpen == 1) {
185                         if (inputDir != "") { //default path is set
186                             string tryPath = inputDir + fName;
187                             m->mothurOut("Unable to open " + fName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
188                             ifstream in2;
189                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
190                             in2.close();
191                             fName = tryPath;
192                         }
193                     }
194                     
195                     if (ableToOpen == 1) {
196                         if (m->getDefaultPath() != "") { //default path is set
197                             string tryPath = m->getDefaultPath() + m->getSimpleName(fName);
198                             m->mothurOut("Unable to open " + fName + ". Trying default " + tryPath); m->mothurOutEndLine();
199                             ifstream in2;
200                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
201                             in2.close();
202                             fName = tryPath;
203                         }
204                     }
205                     
206                     //if you can't open it its not in current working directory or inputDir, try mothur excutable location
207                     if (ableToOpen == 1) {
208                         string exepath = m->argv;
209                         string tempPath = exepath;
210                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
211                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
212                         
213                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(fName);
214                         m->mothurOut("Unable to open " + fName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
215                         ifstream in2;
216                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
217                         in2.close();
218                         fName = tryPath;
219                     }
220                     
221                     if (ableToOpen == 1) {  m->mothurOut("Unable to open " + fName + ". Disregarding. "); m->mothurOutEndLine();  }
222                     else { flowFileVector.push_back(fName); }
223                     m->gobble(flowFilesFile);
224                 }
225                 flowFilesFile.close();
226                 if (flowFileVector.size() == 0) {  m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
227             }
228             else{
229                 if (outputDir == "") { outputDir = m->hasPath(flowFileName); }
230                 flowFileVector.push_back(flowFileName);
231             }
232                 
233                         //check for optional parameter and set defaults
234                         // ...at some point should added some additional type checking...
235                         string temp;
236                         temp = validParameter.validFile(parameters, "lookup", true);
237                         if (temp == "not found")        {       
238                                 lookupFileName = "LookUp_Titanium.pat"; 
239                                 
240                                 int ableToOpen;
241                                 ifstream in;
242                                 ableToOpen = m->openInputFile(lookupFileName, in, "noerror");
243                                 in.close();     
244                                 
245                                 //if you can't open it, try input location
246                                 if (ableToOpen == 1) {
247                                         if (inputDir != "") { //default path is set
248                                                 string tryPath = inputDir + lookupFileName;
249                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
250                                                 ifstream in2;
251                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
252                                                 in2.close();
253                                                 lookupFileName = tryPath;
254                                         }
255                                 }
256                                 
257                                 //if you can't open it, try default location
258                                 if (ableToOpen == 1) {
259                                         if (m->getDefaultPath() != "") { //default path is set
260                                                 string tryPath = m->getDefaultPath() + m->getSimpleName(lookupFileName);
261                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying default " + tryPath); m->mothurOutEndLine();
262                                                 ifstream in2;
263                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
264                                                 in2.close();
265                                                 lookupFileName = tryPath;
266                                         }
267                                 }
268                                 
269                                 //if you can't open it its not in current working directory or inputDir, try mothur excutable location
270                                 if (ableToOpen == 1) {
271                                         string exepath = m->argv;
272                                         string tempPath = exepath;
273                                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
274                                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
275                                         
276                                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
277                                         m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
278                                         ifstream in2;
279                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
280                                         in2.close();
281                                         lookupFileName = tryPath;
282                                 }
283                                 
284                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
285                         }
286                         else if(temp == "not open")     {       
287                                 
288                                 lookupFileName = validParameter.validFile(parameters, "lookup", false);
289                                 
290                                 //if you can't open it its not inputDir, try mothur excutable location
291                                 string exepath = m->argv;
292                                 string tempPath = exepath;
293                                 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
294                                 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
295                                         
296                                 string tryPath = m->getFullPathName(exepath) + lookupFileName;
297                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
298                                 ifstream in2;
299                                 int ableToOpen = m->openInputFile(tryPath, in2, "noerror");
300                                 in2.close();
301                                 lookupFileName = tryPath;
302                                 
303                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
304                         }else                                           {       lookupFileName = temp;  }
305                         
306                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
307                         m->setProcessors(temp);
308                         m->mothurConvert(temp, processors);
309
310                         temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.01";          }
311                         m->mothurConvert(temp, cutoff); 
312                         
313                         temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){       temp = "0.000001";      }
314                         m->mothurConvert(temp, minDelta); 
315
316                         temp = validParameter.validFile(parameters, "maxiter", false);  if (temp == "not found"){       temp = "1000";          }
317                         m->mothurConvert(temp, maxIters); 
318             
319             temp = validParameter.validFile(parameters, "large", false);        if (temp == "not found"){       temp = "0";             }
320                         m->mothurConvert(temp, largeSize); 
321             if (largeSize != 0) { large = true; }
322             else { large = false;  }
323             if (largeSize < 0) {  m->mothurOut("The value of the large cannot be negative.\n"); }
324             
325 #ifdef USE_MPI
326             if (large) { m->mothurOut("The large parameter is not available with the MPI-Enabled version.\n"); large=false; }                           
327 #endif
328             
329
330                         temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
331                         m->mothurConvert(temp, sigma); 
332                         
333                         flowOrder = validParameter.validFile(parameters, "order", false);
334                         if (flowOrder == "not found"){ flowOrder = "TACG";              }
335                         else if(flowOrder.length() != 4){
336                                 m->mothurOut("The value of the order option must be four bases long\n");
337                         }
338                         
339                 }
340 #ifdef USE_MPI
341                 }                               
342 #endif
343         }
344         catch(exception& e) {
345                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
346                 exit(1);
347         }
348 }
349 //**********************************************************************************************************************
350 #ifdef USE_MPI
351 int ShhherCommand::execute(){
352         try {
353                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
354                 
355                 int tag = 1976;
356                 MPI_Status status; 
357                         
358                 if(pid == 0){
359
360                         for(int i=1;i<ncpus;i++){
361                                 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
362                         }
363                         if(abort == 1){ return 0;       }
364
365                         processors = ncpus;
366                         
367                         m->mothurOut("\nGetting preliminary data...\n");
368                         getSingleLookUp();      if (m->control_pressed) { return 0; }
369                         getJointLookUp();       if (m->control_pressed) { return 0; }
370                         
371             vector<string> flowFileVector;
372                         if(flowFilesFileName != "not found"){
373                                 string fName;
374                 
375                                 ifstream flowFilesFile;
376                                 m->openInputFile(flowFilesFileName, flowFilesFile);
377                                 while(flowFilesFile){
378                                         fName = m->getline(flowFilesFile);
379                                         flowFileVector.push_back(fName);
380                                         m->gobble(flowFilesFile);
381                                 }
382                         }
383                         else{
384                                 flowFileVector.push_back(flowFileName);
385                         }
386             
387                         int numFiles = flowFileVector.size();
388
389                         for(int i=1;i<ncpus;i++){
390                                 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
391                         }
392                         
393                         for(int i=0;i<numFiles;i++){
394                                 
395                                 if (m->control_pressed) { break; }
396                                 
397                                 double begClock = clock();
398                                 unsigned long long begTime = time(NULL);
399
400                                 flowFileName = flowFileVector[i];
401                                 
402                                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
403                                 m->mothurOut("Reading flowgrams...\n");
404                                 getFlowData();
405                                 
406                                 if (m->control_pressed) { break; }
407
408                                 m->mothurOut("Identifying unique flowgrams...\n");
409                                 getUniques();
410                                 
411                                 if (m->control_pressed) { break; }
412
413                                 m->mothurOut("Calculating distances between flowgrams...\n");
414                                 char fileName[1024];
415                                 strcpy(fileName, flowFileName.c_str());
416
417                                 for(int i=1;i<ncpus;i++){
418                                         MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
419
420                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
421                                         MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
422                                         MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
423                                         MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
424                                         MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
425                                         MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
426                                         MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
427                                         MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
428                                         MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
429                                         MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
430                                 }                               
431                                                         
432                                 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
433                                 
434                                 if (m->control_pressed) { break; }
435                                 
436                                 int done;
437                                 for(int i=1;i<ncpus;i++){
438                                         MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
439                                         
440                                         m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
441                                         m->mothurRemove((distFileName + ".temp." + toString(i)));
442                                 }
443
444                                 string namesFileName = createNamesFile();
445                                 
446                                 if (m->control_pressed) { break; }
447                                 
448                                 m->mothurOut("\nClustering flowgrams...\n");
449                                 string listFileName = cluster(distFileName, namesFileName);
450                                 
451                                 if (m->control_pressed) { break; }
452                                 
453                 
454                 
455                                 getOTUData(listFileName);
456
457                                 m->mothurRemove(distFileName);
458                                 m->mothurRemove(namesFileName);
459                                 m->mothurRemove(listFileName);
460                                 
461                                 if (m->control_pressed) { break; }
462                                 
463                                 initPyroCluster();
464
465                                 if (m->control_pressed) { break; }
466                                 
467             
468                                 for(int i=1;i<ncpus;i++){
469                                         MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
470                                         MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
471                                         MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
472                                         MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
473                                 }
474                                 
475                                 if (m->control_pressed) { break; }
476                                 
477                                 double maxDelta = 0;
478                                 int iter = 0;
479                                 
480                                 int numOTUsOnCPU = numOTUs / ncpus;
481                                 int numSeqsOnCPU = numSeqs / ncpus;
482                                 m->mothurOut("\nDenoising flowgrams...\n");
483                                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
484                                 
485                                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
486                                         
487                                         double cycClock = clock();
488                                         unsigned long long cycTime = time(NULL);
489                                         fill();
490                                         
491                                         if (m->control_pressed) { break; }
492
493                                         int total = singleTau.size();
494                                         for(int i=1;i<ncpus;i++){
495                                                 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
496                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
497                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
498                                                 
499                                                 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
500                                                 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
501                                                 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
502                                                 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
503                                                 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
504                                         }
505                                         
506                                         calcCentroidsDriver(0, numOTUsOnCPU);
507                                         
508                                         for(int i=1;i<ncpus;i++){
509                                                 int otuStart = i * numOTUs / ncpus;
510                                                 int otuStop = (i + 1) * numOTUs / ncpus;
511                                                 
512                                                 vector<int> tempCentroids(numOTUs, 0);
513                                                 vector<short> tempChange(numOTUs, 0);
514                                                 
515                                                 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
516                                                 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
517                                                 
518                                                 for(int j=otuStart;j<otuStop;j++){
519                                                         centroids[j] = tempCentroids[j];
520                                                         change[j] = tempChange[j];
521                                                 }
522                                         }
523                                                                         
524                                         maxDelta = getNewWeights(); if (m->control_pressed) { break; }
525                                         double nLL = getLikelihood(); if (m->control_pressed) { break; }
526                                         checkCentroids(); if (m->control_pressed) { break; }
527                                         
528                                         for(int i=1;i<ncpus;i++){
529                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
530                                                 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
531                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
532                                         }
533                                         
534                                         calcNewDistancesParent(0, numSeqsOnCPU);
535
536                                         total = singleTau.size();
537
538                                         for(int i=1;i<ncpus;i++){
539                                                 int childTotal;
540                                                 int seqStart = i * numSeqs / ncpus;
541                                                 int seqStop = (i + 1) * numSeqs / ncpus;
542                                                 
543                                                 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
544
545                                                 vector<int> childSeqIndex(childTotal, 0);
546                                                 vector<double> childSingleTau(childTotal, 0);
547                                                 vector<double> childDist(numSeqs * numOTUs, 0);
548                                                 vector<int> otuIndex(childTotal, 0);
549                                                 
550                                                 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
551                                                 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
552                                                 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
553                                                 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
554
555                                                 int oldTotal = total;
556                                                 total += childTotal;
557                                                 singleTau.resize(total, 0);
558                                                 seqIndex.resize(total, 0);
559                                                 seqNumber.resize(total, 0);
560                                                 
561                                                 int childIndex = 0;
562                                                 
563                                                 for(int j=oldTotal;j<total;j++){
564                                                         int otuI = otuIndex[childIndex];
565                                                         int seqI = childSeqIndex[childIndex];
566                                                         
567                                                         singleTau[j] = childSingleTau[childIndex];
568                                                         
569                                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
570                                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
571                                                         nSeqsPerOTU[otuI]++;
572                                                         childIndex++;
573                                                 }
574                                                 
575                                                 int index = seqStart * numOTUs;
576                                                 for(int j=seqStart;j<seqStop;j++){
577                                                         for(int k=0;k<numOTUs;k++){
578                                                                 dist[index] = childDist[index];
579                                                                 index++;
580                                                         }
581                                                 }                                       
582                                         }
583                                         
584                                         iter++;
585                                         
586                                         m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');                  
587
588                                         if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
589                                                 int live = 1;
590                                                 for(int i=1;i<ncpus;i++){
591                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
592                                                 }
593                                         }
594                                         else{
595                                                 int live = 0;
596                                                 for(int i=1;i<ncpus;i++){
597                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
598                                                 }
599                                         }
600                                         
601                                 }       
602                                 
603                                 if (m->control_pressed) { break; }
604                                 
605                                 m->mothurOut("\nFinalizing...\n");
606                                 fill();
607                                 
608                                 if (m->control_pressed) { break; }
609                                 
610                                 setOTUs();
611                                 
612                                 vector<int> otuCounts(numOTUs, 0);
613                                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
614                                 calcCentroidsDriver(0, numOTUs);
615                                 
616                                 if (m->control_pressed) { break; }
617                                 
618                                 writeQualities(otuCounts);      if (m->control_pressed) { break; }
619                                 writeSequences(otuCounts);      if (m->control_pressed) { break; }
620                                 writeNames(otuCounts);          if (m->control_pressed) { break; }
621                                 writeClusters(otuCounts);       if (m->control_pressed) { break; }
622                                 writeGroups();                          if (m->control_pressed) { break; }
623                                 
624                                                                  
625                                 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');                 
626                         }
627                 }
628                 else{
629                         int abort = 1;
630
631                         MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
632                         if(abort){      return 0;       }
633
634                         int numFiles;
635                         MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
636
637                         for(int i=0;i<numFiles;i++){
638                                 
639                                 if (m->control_pressed) { break; }
640                                 
641                                 //Now into the pyrodist part
642                                 bool live = 1;
643
644                                 char fileName[1024];
645                                 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
646                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
647                                 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
648                                 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
649                                 
650                                 flowDataIntI.resize(numSeqs * numFlowCells);
651                                 flowDataPrI.resize(numSeqs * numFlowCells);
652                                 mapUniqueToSeq.resize(numSeqs);
653                                 mapSeqToUnique.resize(numSeqs);
654                                 lengths.resize(numSeqs);
655                                 jointLookUp.resize(NUMBINS * NUMBINS);
656
657                                 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
658                                 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
659                                 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
660                                 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
661                                 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
662                                 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
663                                 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
664
665                                 flowFileName = string(fileName);
666                                 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
667                                 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
668                                 
669                                 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
670                                 
671                                 if (m->control_pressed) { break; }
672                                 
673                                 int done = 1;
674                                 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
675
676                                 //Now into the pyronoise part
677                                 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
678                                 
679                                 singleLookUp.resize(HOMOPS * NUMBINS);
680                                 uniqueFlowgrams.resize(numUniques * numFlowCells);
681                                 weight.resize(numOTUs);
682                                 centroids.resize(numOTUs);
683                                 change.resize(numOTUs);
684                                 dist.assign(numOTUs * numSeqs, 0);
685                                 nSeqsPerOTU.resize(numOTUs);
686                                 cumNumSeqs.resize(numOTUs);
687
688                                 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
689                                 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
690                                 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
691                                 
692                                 int startOTU = pid * numOTUs / ncpus;
693                                 int endOTU = (pid + 1) * numOTUs / ncpus;
694
695                                 int startSeq = pid * numSeqs / ncpus;
696                                 int endSeq = (pid + 1) * numSeqs /ncpus;
697                                 
698                                 int total;
699
700                                 while(live){
701                                         
702                                         if (m->control_pressed) { break; }
703                                         
704                                         MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
705                                         singleTau.assign(total, 0.0000);
706                                         seqNumber.assign(total, 0);
707                                         seqIndex.assign(total, 0);
708
709                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
710                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
711                                         MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
712                                         MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
713                                         MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
714                                         MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
715                                         MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
716
717                                         calcCentroidsDriver(startOTU, endOTU);
718
719                                         MPI_Send(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
720                                         MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
721
722                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
723                                         MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
724                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
725
726                                         vector<int> otuIndex(total, 0);
727                                         calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
728                                         total = otuIndex.size();
729                                         
730                                         MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
731                                         MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
732                                         MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
733                                         MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
734                                         MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
735                                 
736                                         MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
737                                 }
738                         }
739                 }
740                 
741                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
742                 
743                 MPI_Barrier(MPI_COMM_WORLD);
744
745                 
746                 if(compositeFASTAFileName != ""){
747                         outputNames.push_back(compositeFASTAFileName);
748                         outputNames.push_back(compositeNamesFileName);
749                 }
750
751                 m->mothurOutEndLine();
752                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
753                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
754                 m->mothurOutEndLine();
755                 
756                 return 0;
757
758         }
759         catch(exception& e) {
760                 m->errorOut(e, "ShhherCommand", "execute");
761                 exit(1);
762         }
763 }
764 /**************************************************************************************************/
765 string ShhherCommand::createNamesFile(){
766         try{
767                 
768                 vector<string> duplicateNames(numUniques, "");
769                 for(int i=0;i<numSeqs;i++){
770                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
771                 }
772                 
773                 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
774                 
775                 ofstream nameFile;
776                 m->openOutputFile(nameFileName, nameFile);
777                 
778                 for(int i=0;i<numUniques;i++){
779                         
780                         if (m->control_pressed) { break; }
781                         
782             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
783                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
784                 }
785                 
786                 nameFile.close();
787                 return  nameFileName;
788         }
789         catch(exception& e) {
790                 m->errorOut(e, "ShhherCommand", "createNamesFile");
791                 exit(1);
792         }
793 }
794 /**************************************************************************************************/
795
796 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
797         try{            
798                 ostringstream outStream;
799                 outStream.setf(ios::fixed, ios::floatfield);
800                 outStream.setf(ios::dec, ios::basefield);
801                 outStream.setf(ios::showpoint);
802                 outStream.precision(6);
803                 
804                 int begTime = time(NULL);
805                 double begClock = clock();
806                 
807                 for(int i=startSeq;i<stopSeq;i++){
808                         
809                         if (m->control_pressed) { break; }
810                         
811                         for(int j=0;j<i;j++){
812                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
813                                 
814                                 if(flowDistance < 1e-6){
815                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
816                                 }
817                                 else if(flowDistance <= cutoff){
818                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
819                                 }
820                         }
821                         if(i % 100 == 0){
822                                 m->mothurOut(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
823                         }
824                 }
825                 
826                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
827                 if(pid != 0){   fDistFileName += ".temp." + toString(pid);      }
828                 
829                 if (m->control_pressed) { return fDistFileName; }
830                 
831                 m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
832
833                 ofstream distFile(fDistFileName.c_str());
834                 distFile << outStream.str();            
835                 distFile.close();
836                 
837                 return fDistFileName;
838         }
839         catch(exception& e) {
840                 m->errorOut(e, "ShhherCommand", "flowDistMPI");
841                 exit(1);
842         }
843 }
844 /**************************************************************************************************/
845  
846 void ShhherCommand::getOTUData(string listFileName){
847     try {
848         
849         ifstream listFile;
850         m->openInputFile(listFileName, listFile);
851         string label;
852         
853         listFile >> label >> numOTUs;
854         
855         otuData.assign(numSeqs, 0);
856         cumNumSeqs.assign(numOTUs, 0);
857         nSeqsPerOTU.assign(numOTUs, 0);
858         aaP.clear();aaP.resize(numOTUs);
859         
860         seqNumber.clear();
861         aaI.clear();
862         seqIndex.clear();
863         
864         string singleOTU = "";
865         
866         for(int i=0;i<numOTUs;i++){
867             
868             if (m->control_pressed) { break; }
869             
870             listFile >> singleOTU;
871             
872             istringstream otuString(singleOTU);
873             
874             while(otuString){
875                 
876                 string seqName = "";
877                 
878                 for(int j=0;j<singleOTU.length();j++){
879                     char letter = otuString.get();
880                     
881                     if(letter != ','){
882                         seqName += letter;
883                     }
884                     else{
885                         map<string,int>::iterator nmIt = nameMap.find(seqName);
886                         int index = nmIt->second;
887                         
888                         nameMap.erase(nmIt);
889                         
890                         otuData[index] = i;
891                         nSeqsPerOTU[i]++;
892                         aaP[i].push_back(index);
893                         seqName = "";
894                     }
895                 }
896                 
897                 map<string,int>::iterator nmIt = nameMap.find(seqName);
898                 
899                 int index = nmIt->second;
900                 nameMap.erase(nmIt);
901                 
902                 otuData[index] = i;
903                 nSeqsPerOTU[i]++;
904                 aaP[i].push_back(index);        
905                 
906                 otuString.get();
907             }
908             
909             sort(aaP[i].begin(), aaP[i].end());
910             for(int j=0;j<nSeqsPerOTU[i];j++){
911                 seqNumber.push_back(aaP[i][j]);
912             }
913             for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
914                 aaP[i].push_back(0);
915             }
916             
917             
918         }
919         
920         for(int i=1;i<numOTUs;i++){
921             cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
922         }
923         aaI = aaP;
924         seqIndex = seqNumber;
925         
926         listFile.close();       
927         
928     }
929     catch(exception& e) {
930         m->errorOut(e, "ShhherCommand", "getOTUData");
931         exit(1);        
932     }           
933 }
934
935 /**************************************************************************************************/
936
937 void ShhherCommand::initPyroCluster(){                          
938     try{
939         if (numOTUs < processors) { processors = 1; }
940         
941         dist.assign(numSeqs * numOTUs, 0);
942         change.assign(numOTUs, 1);
943         centroids.assign(numOTUs, -1);
944         weight.assign(numOTUs, 0);
945         singleTau.assign(numSeqs, 1.0);
946         
947         nSeqsBreaks.assign(processors+1, 0);
948         nOTUsBreaks.assign(processors+1, 0);
949         
950         nSeqsBreaks[0] = 0;
951         for(int i=0;i<processors;i++){
952             nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
953             nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
954         }
955         nSeqsBreaks[processors] = numSeqs;
956         nOTUsBreaks[processors] = numOTUs;
957     }
958     catch(exception& e) {
959         m->errorOut(e, "ShhherCommand", "initPyroCluster");
960         exit(1);        
961     }
962 }
963
964 /**************************************************************************************************/
965
966 void ShhherCommand::fill(){
967     try {
968         int index = 0;
969         for(int i=0;i<numOTUs;i++){
970             
971             if (m->control_pressed) { break; }
972             
973             cumNumSeqs[i] = index;
974             for(int j=0;j<nSeqsPerOTU[i];j++){
975                 seqNumber[index] = aaP[i][j];
976                 seqIndex[index] = aaI[i][j];
977                 
978                 index++;
979             }
980         }
981     }
982     catch(exception& e) {
983         m->errorOut(e, "ShhherCommand", "fill");
984         exit(1);        
985     }           
986 }
987
988 /**************************************************************************************************/
989  
990 void ShhherCommand::getFlowData(){
991     try{
992         ifstream flowFile;
993         m->openInputFile(flowFileName, flowFile);
994         
995         string seqName;
996         seqNameVector.clear();
997         lengths.clear();
998         flowDataIntI.clear();
999         nameMap.clear();
1000         
1001         
1002         int currentNumFlowCells;
1003         
1004         float intensity;
1005         
1006         flowFile >> numFlowCells;
1007         int index = 0;//pcluster
1008         while(!flowFile.eof()){
1009             
1010             if (m->control_pressed) { break; }
1011             
1012             flowFile >> seqName >> currentNumFlowCells;
1013             lengths.push_back(currentNumFlowCells);
1014             
1015             seqNameVector.push_back(seqName);
1016             nameMap[seqName] = index++;//pcluster
1017             
1018             for(int i=0;i<numFlowCells;i++){
1019                 flowFile >> intensity;
1020                 if(intensity > 9.99)    {       intensity = 9.99;       }
1021                 int intI = int(100 * intensity + 0.0001);
1022                 flowDataIntI.push_back(intI);
1023             }
1024             m->gobble(flowFile);
1025         }
1026         flowFile.close();
1027         
1028         numSeqs = seqNameVector.size();         
1029         
1030         for(int i=0;i<numSeqs;i++){
1031             
1032             if (m->control_pressed) { break; }
1033             
1034             int iNumFlowCells = i * numFlowCells;
1035             for(int j=lengths[i];j<numFlowCells;j++){
1036                 flowDataIntI[iNumFlowCells + j] = 0;
1037             }
1038         }
1039         
1040     }
1041     catch(exception& e) {
1042         m->errorOut(e, "ShhherCommand", "getFlowData");
1043         exit(1);
1044     }
1045 }
1046 /**************************************************************************************************/
1047 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1048         
1049         try{
1050                 vector<double> newTau(numOTUs,0);
1051                 vector<double> norms(numSeqs, 0);
1052                 otuIndex.clear();
1053                 seqIndex.clear();
1054                 singleTau.clear();
1055                 
1056                 for(int i=startSeq;i<stopSeq;i++){
1057                         
1058                         if (m->control_pressed) { break; }
1059                         
1060                         double offset = 1e8;
1061                         int indexOffset = i * numOTUs;
1062                         
1063                         for(int j=0;j<numOTUs;j++){
1064                                 
1065                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1066                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1067                                 }
1068                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1069                                         offset = dist[indexOffset + j];
1070                                 }
1071                         }
1072                         
1073                         for(int j=0;j<numOTUs;j++){
1074                                 if(weight[j] > MIN_WEIGHT){
1075                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1076                                         norms[i] += newTau[j];
1077                                 }
1078                                 else{
1079                                         newTau[j] = 0.0;
1080                                 }
1081                         }
1082                         
1083                         for(int j=0;j<numOTUs;j++){
1084                 
1085                                 newTau[j] /= norms[i];
1086                                 
1087                                 if(newTau[j] > MIN_TAU){
1088                                         otuIndex.push_back(j);
1089                                         seqIndex.push_back(i);
1090                                         singleTau.push_back(newTau[j]);
1091                                 }
1092                         }
1093                         
1094                 }
1095         }
1096         catch(exception& e) {
1097                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1098                 exit(1);        
1099         }               
1100 }
1101
1102 /**************************************************************************************************/
1103
1104 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1105     
1106     try{
1107         
1108         int total = 0;
1109         vector<double> newTau(numOTUs,0);
1110         vector<double> norms(numSeqs, 0);
1111         nSeqsPerOTU.assign(numOTUs, 0);
1112         
1113         for(int i=startSeq;i<stopSeq;i++){
1114             
1115             if (m->control_pressed) { break; }
1116             
1117             int indexOffset = i * numOTUs;
1118             
1119             double offset = 1e8;
1120             
1121             for(int j=0;j<numOTUs;j++){
1122                 
1123                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1124                     dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1125                 }
1126                 
1127                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1128                     offset = dist[indexOffset + j];
1129                 }
1130             }
1131             
1132             for(int j=0;j<numOTUs;j++){
1133                 if(weight[j] > MIN_WEIGHT){
1134                     newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1135                     norms[i] += newTau[j];
1136                 }
1137                 else{
1138                     newTau[j] = 0.0;
1139                 }
1140             }
1141             
1142             for(int j=0;j<numOTUs;j++){
1143                 newTau[j] /= norms[i];
1144             }
1145             
1146             for(int j=0;j<numOTUs;j++){
1147                 if(newTau[j] > MIN_TAU){
1148                     
1149                     int oldTotal = total;
1150                     
1151                     total++;
1152                     
1153                     singleTau.resize(total, 0);
1154                     seqNumber.resize(total, 0);
1155                     seqIndex.resize(total, 0);
1156                     
1157                     singleTau[oldTotal] = newTau[j];
1158                     
1159                     aaP[j][nSeqsPerOTU[j]] = oldTotal;
1160                     aaI[j][nSeqsPerOTU[j]] = i;
1161                     nSeqsPerOTU[j]++;
1162                 }
1163             }
1164             
1165         }
1166         
1167     }
1168     catch(exception& e) {
1169         m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1170         exit(1);        
1171     }           
1172 }
1173
1174 /**************************************************************************************************/
1175
1176 void ShhherCommand::setOTUs(){
1177     
1178     try {
1179         vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1180         
1181         for(int i=0;i<numOTUs;i++){
1182             
1183             if (m->control_pressed) { break; }
1184             
1185             for(int j=0;j<nSeqsPerOTU[i];j++){
1186                 int index = cumNumSeqs[i] + j;
1187                 double tauValue = singleTau[seqNumber[index]];
1188                 int sIndex = seqIndex[index];
1189                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
1190             }
1191         }
1192         
1193         for(int i=0;i<numSeqs;i++){
1194             double maxTau = -1.0000;
1195             int maxOTU = -1;
1196             for(int j=0;j<numOTUs;j++){
1197                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1198                     maxTau = bigTauMatrix[i * numOTUs + j];
1199                     maxOTU = j;
1200                 }
1201             }
1202             
1203             otuData[i] = maxOTU;
1204         }
1205         
1206         nSeqsPerOTU.assign(numOTUs, 0);         
1207         
1208         for(int i=0;i<numSeqs;i++){
1209             int index = otuData[i];
1210             
1211             singleTau[i] = 1.0000;
1212             dist[i] = 0.0000;
1213             
1214             aaP[index][nSeqsPerOTU[index]] = i;
1215             aaI[index][nSeqsPerOTU[index]] = i;
1216             
1217             nSeqsPerOTU[index]++;
1218         }
1219         fill(); 
1220     }
1221     catch(exception& e) {
1222         m->errorOut(e, "ShhherCommand", "setOTUs");
1223         exit(1);        
1224     }           
1225 }
1226
1227 /**************************************************************************************************/
1228  
1229 void ShhherCommand::getUniques(){
1230     try{
1231         
1232         
1233         numUniques = 0;
1234         uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
1235         uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
1236         uniqueLengths.assign(numSeqs, 0);
1237         mapSeqToUnique.assign(numSeqs, -1);
1238         mapUniqueToSeq.assign(numSeqs, -1);
1239         
1240         vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
1241         
1242         for(int i=0;i<numSeqs;i++){
1243             
1244             if (m->control_pressed) { break; }
1245             
1246             int index = 0;
1247             
1248             vector<short> current(numFlowCells);
1249             for(int j=0;j<numFlowCells;j++){
1250                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
1251             }
1252             
1253             for(int j=0;j<numUniques;j++){
1254                 int offset = j * numFlowCells;
1255                 bool toEnd = 1;
1256                 
1257                 int shorterLength;
1258                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
1259                 else                                                            {       shorterLength = uniqueLengths[j];       }
1260                 
1261                 for(int k=0;k<shorterLength;k++){
1262                     if(current[k] != uniqueFlowgrams[offset + k]){
1263                         toEnd = 0;
1264                         break;
1265                     }
1266                 }
1267                 
1268                 if(toEnd){
1269                     mapSeqToUnique[i] = j;
1270                     uniqueCount[j]++;
1271                     index = j;
1272                     if(lengths[i] > uniqueLengths[j])   {       uniqueLengths[j] = lengths[i];  }
1273                     break;
1274                 }
1275                 index++;
1276             }
1277             
1278             if(index == numUniques){
1279                 uniqueLengths[numUniques] = lengths[i];
1280                 uniqueCount[numUniques] = 1;
1281                 mapSeqToUnique[i] = numUniques;//anMap
1282                 mapUniqueToSeq[numUniques] = i;//anF
1283                 
1284                 for(int k=0;k<numFlowCells;k++){
1285                     uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
1286                     uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
1287                 }
1288                 
1289                 numUniques++;
1290             }
1291         }
1292         uniqueFlowDataIntI.resize(numFlowCells * numUniques);
1293         uniqueLengths.resize(numUniques);       
1294         
1295         flowDataPrI.resize(numSeqs * numFlowCells, 0);
1296         for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
1297     }
1298     catch(exception& e) {
1299         m->errorOut(e, "ShhherCommand", "getUniques");
1300         exit(1);
1301     }
1302 }
1303
1304 /**************************************************************************************************/
1305
1306 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
1307     try{
1308         int minLength = lengths[mapSeqToUnique[seqA]];
1309         if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
1310         
1311         int ANumFlowCells = seqA * numFlowCells;
1312         int BNumFlowCells = seqB * numFlowCells;
1313         
1314         float dist = 0;
1315         
1316         for(int i=0;i<minLength;i++){
1317             
1318             if (m->control_pressed) { break; }
1319             
1320             int flowAIntI = flowDataIntI[ANumFlowCells + i];
1321             float flowAPrI = flowDataPrI[ANumFlowCells + i];
1322             
1323             int flowBIntI = flowDataIntI[BNumFlowCells + i];
1324             float flowBPrI = flowDataPrI[BNumFlowCells + i];
1325             dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
1326         }
1327         
1328         dist /= (float) minLength;
1329         return dist;
1330     }
1331     catch(exception& e) {
1332         m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
1333         exit(1);
1334     }
1335 }
1336
1337 //**********************************************************************************************************************/
1338
1339 string ShhherCommand::cluster(string distFileName, string namesFileName){
1340     try {
1341         
1342         ReadMatrix* read = new ReadColumnMatrix(distFileName);  
1343         read->setCutoff(cutoff);
1344         
1345         NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1346         clusterNameMap->readMap();
1347         read->read(clusterNameMap);
1348         
1349         ListVector* list = read->getListVector();
1350         SparseMatrix* matrix = read->getMatrix();
1351         
1352         delete read; 
1353         delete clusterNameMap; 
1354         
1355         RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
1356         
1357         Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
1358         string tag = cluster->getTag();
1359         
1360         double clusterCutoff = cutoff;
1361         while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1362             
1363             if (m->control_pressed) { break; }
1364             
1365             cluster->update(clusterCutoff);
1366         }
1367         
1368         list->setLabel(toString(cutoff));
1369         
1370         string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
1371         ofstream listFile;
1372         m->openOutputFile(listFileName, listFile);
1373         list->print(listFile);
1374         listFile.close();
1375         
1376         delete matrix;  delete cluster; delete rabund; delete list;
1377         
1378         return listFileName;
1379     }
1380     catch(exception& e) {
1381         m->errorOut(e, "ShhherCommand", "cluster");
1382         exit(1);        
1383     }           
1384 }
1385
1386 /**************************************************************************************************/
1387
1388 void ShhherCommand::calcCentroidsDriver(int start, int finish){                          
1389     
1390     //this function gets the most likely homopolymer length at a flow position for a group of sequences
1391     //within an otu
1392     
1393     try{
1394         
1395         for(int i=start;i<finish;i++){
1396             
1397             if (m->control_pressed) { break; }
1398             
1399             double count = 0;
1400             int position = 0;
1401             int minFlowGram = 100000000;
1402             double minFlowValue = 1e8;
1403             change[i] = 0; //FALSE
1404             
1405             for(int j=0;j<nSeqsPerOTU[i];j++){
1406                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1407             }
1408             
1409             if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1410                 vector<double> adF(nSeqsPerOTU[i]);
1411                 vector<int> anL(nSeqsPerOTU[i]);
1412                 
1413                 for(int j=0;j<nSeqsPerOTU[i];j++){
1414                     int index = cumNumSeqs[i] + j;
1415                     int nI = seqIndex[index];
1416                     int nIU = mapSeqToUnique[nI];
1417                     
1418                     int k;
1419                     for(k=0;k<position;k++){
1420                         if(nIU == anL[k]){
1421                             break;
1422                         }
1423                     }
1424                     if(k == position){
1425                         anL[position] = nIU;
1426                         adF[position] = 0.0000;
1427                         position++;
1428                     }                                           
1429                 }
1430                 
1431                 for(int j=0;j<nSeqsPerOTU[i];j++){
1432                     int index = cumNumSeqs[i] + j;
1433                     int nI = seqIndex[index];
1434                     
1435                     double tauValue = singleTau[seqNumber[index]];
1436                     
1437                     for(int k=0;k<position;k++){
1438                         double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1439                         adF[k] += dist * tauValue;
1440                     }
1441                 }
1442                 
1443                 for(int j=0;j<position;j++){
1444                     if(adF[j] < minFlowValue){
1445                         minFlowGram = j;
1446                         minFlowValue = adF[j];
1447                     }
1448                 }
1449                 
1450                 if(centroids[i] != anL[minFlowGram]){
1451                     change[i] = 1;
1452                     centroids[i] = anL[minFlowGram];
1453                 }
1454             }
1455             else if(centroids[i] != -1){
1456                 change[i] = 1;
1457                 centroids[i] = -1;                      
1458             }
1459         }
1460     }
1461     catch(exception& e) {
1462         m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1463         exit(1);        
1464     }           
1465 }
1466
1467 /**************************************************************************************************/
1468
1469 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1470     try{
1471         
1472         int flowAValue = cent * numFlowCells;
1473         int flowBValue = flow * numFlowCells;
1474         
1475         double dist = 0;
1476         
1477         for(int i=0;i<length;i++){
1478             dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1479             flowAValue++;
1480             flowBValue++;
1481         }
1482         
1483         return dist / (double)length;
1484     }
1485     catch(exception& e) {
1486         m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1487         exit(1);        
1488     }           
1489 }
1490
1491 /**************************************************************************************************/
1492
1493 double ShhherCommand::getNewWeights(){
1494     try{
1495         
1496         double maxChange = 0;
1497         
1498         for(int i=0;i<numOTUs;i++){
1499             
1500             if (m->control_pressed) { break; }
1501             
1502             double difference = weight[i];
1503             weight[i] = 0;
1504             
1505             for(int j=0;j<nSeqsPerOTU[i];j++){
1506                 int index = cumNumSeqs[i] + j;
1507                 double tauValue = singleTau[seqNumber[index]];
1508                 weight[i] += tauValue;
1509             }
1510             
1511             difference = fabs(weight[i] - difference);
1512             if(difference > maxChange){ maxChange = difference; }
1513         }
1514         return maxChange;
1515     }
1516     catch(exception& e) {
1517         m->errorOut(e, "ShhherCommand", "getNewWeights");
1518         exit(1);        
1519     }           
1520 }
1521  
1522  /**************************************************************************************************/
1523  
1524 double ShhherCommand::getLikelihood(){
1525     
1526     try{
1527         
1528         vector<long double> P(numSeqs, 0);
1529         int effNumOTUs = 0;
1530         
1531         for(int i=0;i<numOTUs;i++){
1532             if(weight[i] > MIN_WEIGHT){
1533                 effNumOTUs++;
1534             }
1535         }
1536         
1537         string hold;
1538         for(int i=0;i<numOTUs;i++){
1539             
1540             if (m->control_pressed) { break; }
1541             
1542             for(int j=0;j<nSeqsPerOTU[i];j++){
1543                 int index = cumNumSeqs[i] + j;
1544                 int nI = seqIndex[index];
1545                 double singleDist = dist[seqNumber[index]];
1546                 
1547                 P[nI] += weight[i] * exp(-singleDist * sigma);
1548             }
1549         }
1550         double nLL = 0.00;
1551         for(int i=0;i<numSeqs;i++){
1552             if(P[i] == 0){      P[i] = DBL_EPSILON;     }
1553             
1554             nLL += -log(P[i]);
1555         }
1556         
1557         nLL = nLL -(double)numSeqs * log(sigma);
1558         
1559         return nLL; 
1560     }
1561     catch(exception& e) {
1562         m->errorOut(e, "ShhherCommand", "getNewWeights");
1563         exit(1);        
1564     }           
1565 }
1566
1567 /**************************************************************************************************/
1568
1569 void ShhherCommand::checkCentroids(){
1570     try{
1571         vector<int> unique(numOTUs, 1);
1572         
1573         for(int i=0;i<numOTUs;i++){
1574             if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1575                 unique[i] = -1;
1576             }
1577         }
1578         
1579         for(int i=0;i<numOTUs;i++){
1580             
1581             if (m->control_pressed) { break; }
1582             
1583             if(unique[i] == 1){
1584                 for(int j=i+1;j<numOTUs;j++){
1585                     if(unique[j] == 1){
1586                         
1587                         if(centroids[j] == centroids[i]){
1588                             unique[j] = 0;
1589                             centroids[j] = -1;
1590                             
1591                             weight[i] += weight[j];
1592                             weight[j] = 0.0;
1593                         }
1594                     }
1595                 }
1596             }
1597         }
1598     }
1599     catch(exception& e) {
1600         m->errorOut(e, "ShhherCommand", "checkCentroids");
1601         exit(1);        
1602     }           
1603 }
1604  /**************************************************************************************************/
1605
1606
1607  
1608 void ShhherCommand::writeQualities(vector<int> otuCounts){
1609     
1610     try {
1611         string thisOutputDir = outputDir;
1612         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1613         string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.qual";
1614         
1615         ofstream qualityFile;
1616         m->openOutputFile(qualityFileName, qualityFile);
1617         
1618         qualityFile.setf(ios::fixed, ios::floatfield);
1619         qualityFile.setf(ios::showpoint);
1620         qualityFile << setprecision(6);
1621         
1622         vector<vector<int> > qualities(numOTUs);
1623         vector<double> pr(HOMOPS, 0);
1624         
1625         
1626         for(int i=0;i<numOTUs;i++){
1627             
1628             if (m->control_pressed) { break; }
1629             
1630             int index = 0;
1631             int base = 0;
1632             
1633             if(nSeqsPerOTU[i] > 0){
1634                 qualities[i].assign(1024, -1);
1635                 
1636                 while(index < numFlowCells){
1637                     double maxPrValue = 1e8;
1638                     short maxPrIndex = -1;
1639                     double count = 0.0000;
1640                     
1641                     pr.assign(HOMOPS, 0);
1642                     
1643                     for(int j=0;j<nSeqsPerOTU[i];j++){
1644                         int lIndex = cumNumSeqs[i] + j;
1645                         double tauValue = singleTau[seqNumber[lIndex]];
1646                         int sequenceIndex = aaI[i][j];
1647                         short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1648                         
1649                         count += tauValue;
1650                         
1651                         for(int s=0;s<HOMOPS;s++){
1652                             pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1653                         }
1654                     }
1655                     
1656                     maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1657                     maxPrValue = pr[maxPrIndex];
1658                     
1659                     if(count > MIN_COUNT){
1660                         double U = 0.0000;
1661                         double norm = 0.0000;
1662                         
1663                         for(int s=0;s<HOMOPS;s++){
1664                             norm += exp(-(pr[s] - maxPrValue));
1665                         }
1666                         
1667                         for(int s=1;s<=maxPrIndex;s++){
1668                             int value = 0;
1669                             double temp = 0.0000;
1670                             
1671                             U += exp(-(pr[s-1]-maxPrValue))/norm;
1672                             
1673                             if(U>0.00){
1674                                 temp = log10(U);
1675                             }
1676                             else{
1677                                 temp = -10.1;
1678                             }
1679                             temp = floor(-10 * temp);
1680                             value = (int)floor(temp);
1681                             if(value > 100){    value = 100;    }
1682                             
1683                             qualities[i][base] = (int)value;
1684                             base++;
1685                         }
1686                     }
1687                     
1688                     index++;
1689                 }
1690             }
1691             
1692             
1693             if(otuCounts[i] > 0){
1694                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1695                 
1696                 int j=4;        //need to get past the first four bases
1697                 while(qualities[i][j] != -1){
1698                     qualityFile << qualities[i][j] << ' ';
1699                     j++;
1700                 }
1701                 qualityFile << endl;
1702             }
1703         }
1704         qualityFile.close();
1705         outputNames.push_back(qualityFileName);
1706         
1707     }
1708     catch(exception& e) {
1709         m->errorOut(e, "ShhherCommand", "writeQualities");
1710         exit(1);        
1711     }           
1712 }
1713
1714 /**************************************************************************************************/
1715
1716 void ShhherCommand::writeSequences(vector<int> otuCounts){
1717     try {
1718         string thisOutputDir = outputDir;
1719         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1720         string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.fasta";
1721         ofstream fastaFile;
1722         m->openOutputFile(fastaFileName, fastaFile);
1723         
1724         vector<string> names(numOTUs, "");
1725         
1726         for(int i=0;i<numOTUs;i++){
1727             
1728             if (m->control_pressed) { break; }
1729             
1730             int index = centroids[i];
1731             
1732             if(otuCounts[i] > 0){
1733                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1734                 
1735                 string newSeq = "";
1736                 
1737                 for(int j=0;j<numFlowCells;j++){
1738                     
1739                     char base = flowOrder[j % 4];
1740                     for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
1741                         newSeq += base;
1742                     }
1743                 }
1744                 
1745                 fastaFile << newSeq.substr(4) << endl;
1746             }
1747         }
1748         fastaFile.close();
1749         
1750         outputNames.push_back(fastaFileName);
1751         
1752         if(compositeFASTAFileName != ""){
1753             m->appendFiles(fastaFileName, compositeFASTAFileName);
1754         }
1755     }
1756     catch(exception& e) {
1757         m->errorOut(e, "ShhherCommand", "writeSequences");
1758         exit(1);        
1759     }           
1760 }
1761
1762 /**************************************************************************************************/
1763
1764 void ShhherCommand::writeNames(vector<int> otuCounts){
1765     try {
1766         string thisOutputDir = outputDir;
1767         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1768         string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.names";
1769         ofstream nameFile;
1770         m->openOutputFile(nameFileName, nameFile);
1771         
1772         for(int i=0;i<numOTUs;i++){
1773             
1774             if (m->control_pressed) { break; }
1775             
1776             if(otuCounts[i] > 0){
1777                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
1778                 
1779                 for(int j=1;j<nSeqsPerOTU[i];j++){
1780                     nameFile << ',' << seqNameVector[aaI[i][j]];
1781                 }
1782                 
1783                 nameFile << endl;
1784             }
1785         }
1786         nameFile.close();
1787         outputNames.push_back(nameFileName);
1788         
1789         
1790         if(compositeNamesFileName != ""){
1791             m->appendFiles(nameFileName, compositeNamesFileName);
1792         }               
1793     }
1794     catch(exception& e) {
1795         m->errorOut(e, "ShhherCommand", "writeNames");
1796         exit(1);        
1797     }           
1798 }
1799
1800 /**************************************************************************************************/
1801
1802 void ShhherCommand::writeGroups(){
1803     try {
1804         string thisOutputDir = outputDir;
1805         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1806         string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1807         string groupFileName = fileRoot + "shhh.groups";
1808         ofstream groupFile;
1809         m->openOutputFile(groupFileName, groupFile);
1810         
1811         for(int i=0;i<numSeqs;i++){
1812             if (m->control_pressed) { break; }
1813             groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
1814         }
1815         groupFile.close();
1816         outputNames.push_back(groupFileName);
1817         
1818     }
1819     catch(exception& e) {
1820         m->errorOut(e, "ShhherCommand", "writeGroups");
1821         exit(1);        
1822     }           
1823 }
1824
1825 /**************************************************************************************************/
1826
1827 void ShhherCommand::writeClusters(vector<int> otuCounts){
1828     try {
1829         string thisOutputDir = outputDir;
1830         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1831         string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.counts";
1832         ofstream otuCountsFile;
1833         m->openOutputFile(otuCountsFileName, otuCountsFile);
1834         
1835         string bases = flowOrder;
1836         
1837         for(int i=0;i<numOTUs;i++){
1838             
1839             if (m->control_pressed) {
1840                 break;
1841             }
1842             //output the translated version of the centroid sequence for the otu
1843             if(otuCounts[i] > 0){
1844                 int index = centroids[i];
1845                 
1846                 otuCountsFile << "ideal\t";
1847                 for(int j=8;j<numFlowCells;j++){
1848                     char base = bases[j % 4];
1849                     for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
1850                         otuCountsFile << base;
1851                     }
1852                 }
1853                 otuCountsFile << endl;
1854                 
1855                 for(int j=0;j<nSeqsPerOTU[i];j++){
1856                     int sequence = aaI[i][j];
1857                     otuCountsFile << seqNameVector[sequence] << '\t';
1858                     
1859                     string newSeq = "";
1860                     
1861                     for(int k=0;k<lengths[sequence];k++){
1862                         char base = bases[k % 4];
1863                         int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
1864                         
1865                         for(int s=0;s<freq;s++){
1866                             newSeq += base;
1867                             //otuCountsFile << base;
1868                         }
1869                     }
1870                     otuCountsFile << newSeq.substr(4) << endl;
1871                 }
1872                 otuCountsFile << endl;
1873             }
1874         }
1875         otuCountsFile.close();
1876         outputNames.push_back(otuCountsFileName);
1877         
1878     }
1879     catch(exception& e) {
1880         m->errorOut(e, "ShhherCommand", "writeClusters");
1881         exit(1);        
1882     }           
1883 }
1884
1885 #else
1886 //**********************************************************************************************************************
1887
1888 int ShhherCommand::execute(){
1889         try {
1890                 if (abort == true) { return 0; }
1891                 
1892                 getSingleLookUp();      if (m->control_pressed) { return 0; }
1893                 getJointLookUp();       if (m->control_pressed) { return 0; }
1894                 
1895         int numFiles = flowFileVector.size();
1896                 
1897         if (numFiles < processors) { processors = numFiles; }
1898         
1899 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1900         if (processors == 1) { driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size()); }
1901         else { createProcesses(flowFileVector); } //each processor processes one file
1902 #else
1903         driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size());
1904 #endif
1905         
1906                 if(compositeFASTAFileName != ""){
1907                         outputNames.push_back(compositeFASTAFileName);
1908                         outputNames.push_back(compositeNamesFileName);
1909                 }
1910
1911                 m->mothurOutEndLine();
1912                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
1913                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
1914                 m->mothurOutEndLine();
1915                 
1916                 return 0;
1917         }
1918         catch(exception& e) {
1919                 m->errorOut(e, "ShhherCommand", "execute");
1920                 exit(1);
1921         }
1922 }
1923 #endif
1924 /**************************************************************************************************/
1925
1926 int ShhherCommand::createProcesses(vector<string> filenames){
1927     try {
1928         vector<int> processIDS;
1929                 int process = 1;
1930                 int num = 0;
1931                 
1932                 //sanity check
1933                 if (filenames.size() < processors) { processors = filenames.size(); }
1934                 
1935                 //divide the groups between the processors
1936                 vector<linePair> lines;
1937                 int numFilesPerProcessor = filenames.size() / processors;
1938                 for (int i = 0; i < processors; i++) {
1939                         int startIndex =  i * numFilesPerProcessor;
1940                         int endIndex = (i+1) * numFilesPerProcessor;
1941                         if(i == (processors - 1)){      endIndex = filenames.size();    }
1942                         lines.push_back(linePair(startIndex, endIndex));
1943                 }
1944                 
1945         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)          
1946                 
1947                 //loop through and create all the processes you want
1948                 while (process != processors) {
1949                         int pid = fork();
1950                         
1951                         if (pid > 0) {
1952                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1953                                 process++;
1954                         }else if (pid == 0){
1955                                 num = driver(filenames, compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName  + toString(getpid()) + ".temp", lines[process].start, lines[process].end);
1956                                 exit(0);
1957                         }else { 
1958                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
1959                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1960                                 exit(0);
1961                         }
1962                 }
1963                 
1964                 //do my part
1965                 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[0].start, lines[0].end);
1966                 
1967                 //force parent to wait until all the processes are done
1968                 for (int i=0;i<processIDS.size();i++) { 
1969                         int temp = processIDS[i];
1970                         wait(&temp);
1971                 }
1972         
1973         #else
1974         
1975         //////////////////////////////////////////////////////////////////////////////////////////////////////
1976         
1977         /////////////////////// NOT WORKING, ACCESS VIOLATION ON READ OF FLOWGRAMS IN THREAD /////////////////
1978         
1979         //////////////////////////////////////////////////////////////////////////////////////////////////////
1980                 //Windows version shared memory, so be careful when passing variables through the shhhFlowsData struct. 
1981                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
1982                 //////////////////////////////////////////////////////////////////////////////////////////////////////
1983                 
1984                 vector<shhhFlowsData*> pDataArray; 
1985                 DWORD   dwThreadIdArray[processors-1];
1986                 HANDLE  hThreadArray[processors-1]; 
1987                 
1988                 //Create processor worker threads.
1989                 for( int i=0; i<processors-1; i++ ){
1990                         // Allocate memory for thread data.
1991                         string extension = "";
1992                         if (i != 0) { extension = toString(i) + ".temp"; }
1993                         
1994             shhhFlowsData* tempFlow = new shhhFlowsData(filenames, (compositeFASTAFileName + extension), (compositeNamesFileName + extension), outputDir, flowOrder, jointLookUp, singleLookUp, m, lines[i].start, lines[i].end, cutoff, sigma, minDelta, maxIters, i);
1995                         pDataArray.push_back(tempFlow);
1996                         processIDS.push_back(i);
1997             
1998                         hThreadArray[i] = CreateThread(NULL, 0, ShhhFlowsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
1999                 }
2000                 
2001                 //using the main process as a worker saves time and memory
2002                 //do my part
2003                 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[processors-1].start, lines[processors-1].end);
2004                 
2005                 //Wait until all threads have terminated.
2006                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
2007                 
2008                 //Close all thread handles and free memory allocations.
2009                 for(int i=0; i < pDataArray.size(); i++){
2010                         for(int j=0; j < pDataArray[i]->outputNames.size(); j++){ outputNames.push_back(pDataArray[i]->outputNames[j]); }
2011                         CloseHandle(hThreadArray[i]);
2012                         delete pDataArray[i];
2013                 }
2014                 
2015         #endif
2016         
2017         for (int i=0;i<processIDS.size();i++) { 
2018             if (compositeFASTAFileName != "") {
2019                 m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
2020                 m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
2021                 m->mothurRemove((compositeFASTAFileName + toString(processIDS[i]) + ".temp"));
2022                 m->mothurRemove((compositeNamesFileName + toString(processIDS[i]) + ".temp"));
2023             }
2024         }
2025         
2026         return 0;
2027         
2028     }
2029         catch(exception& e) {
2030                 m->errorOut(e, "ShhherCommand", "createProcesses");
2031                 exit(1);
2032         }
2033 }
2034 /**************************************************************************************************/
2035
2036 vector<string> ShhherCommand::parseFlowFiles(string filename){
2037     try {
2038         vector<string> files;
2039         int count = 0;
2040         
2041         ifstream in;
2042         m->openInputFile(filename, in);
2043         
2044         int thisNumFLows = 0;
2045         in >> thisNumFLows; m->gobble(in);
2046         
2047         while (!in.eof()) {
2048             if (m->control_pressed) { break; }
2049             
2050             ofstream out;
2051             string outputFileName = filename + toString(count) + ".temp";
2052             m->openOutputFile(outputFileName, out);
2053             out << thisNumFLows << endl;
2054             files.push_back(outputFileName);
2055             
2056             int numLinesWrote = 0;
2057             for (int i = 0; i < largeSize; i++) {
2058                 if (in.eof()) { break; }
2059                 string line = m->getline(in); m->gobble(in);
2060                 out << line << endl;
2061                 numLinesWrote++;
2062             }
2063             out.close();
2064             
2065             if (numLinesWrote == 0) {  m->mothurRemove(outputFileName); files.pop_back();  }
2066             count++;
2067         }
2068         in.close();
2069         
2070         if (m->control_pressed) { for (int i = 0; i < files.size(); i++) { m->mothurRemove(files[i]); }  files.clear(); }
2071         
2072         m->mothurOut("\nDivided " + filename + " into " + toString(files.size()) + " files.\n\n"); 
2073         
2074         return files;
2075     }
2076         catch(exception& e) {
2077                 m->errorOut(e, "ShhherCommand", "parseFlowFiles");
2078                 exit(1);
2079         }
2080 }
2081 /**************************************************************************************************/
2082
2083 int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName, int start, int end){
2084     try {
2085         
2086         for(int i=start;i<end;i++){
2087                         
2088                         if (m->control_pressed) { break; }
2089                         
2090             vector<string> theseFlowFileNames; theseFlowFileNames.push_back(filenames[i]);
2091             if (large) {  theseFlowFileNames = parseFlowFiles(filenames[i]);  }
2092             
2093             if (m->control_pressed) { break; }
2094             
2095             double begClock = clock();
2096             unsigned long long begTime;
2097             
2098             for (int g = 0; g < theseFlowFileNames.size(); g++) {
2099                 
2100                 string flowFileName = theseFlowFileNames[g];
2101                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
2102                 m->mothurOut("Reading flowgrams...\n");
2103                 
2104                 vector<string> seqNameVector;
2105                 vector<int> lengths;
2106                 vector<short> flowDataIntI;
2107                 vector<double> flowDataPrI;
2108                 map<string, int> nameMap;
2109                 vector<short> uniqueFlowgrams;
2110                 vector<int> uniqueCount;
2111                 vector<int> mapSeqToUnique;
2112                 vector<int> mapUniqueToSeq;
2113                 vector<int> uniqueLengths;
2114                 int numFlowCells;
2115                 
2116                 int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
2117                 
2118                 if (m->control_pressed) { break; }
2119                 
2120                 m->mothurOut("Identifying unique flowgrams...\n");
2121                 int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
2122                 
2123                 if (m->control_pressed) { break; }
2124                 
2125                 m->mothurOut("Calculating distances between flowgrams...\n");
2126                 string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
2127                 begTime = time(NULL);
2128                
2129                 
2130                 flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI); 
2131                 
2132                 m->mothurOutEndLine();
2133                 m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
2134                 
2135                 
2136                 string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
2137                 createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
2138                 
2139                 if (m->control_pressed) { break; }
2140                 
2141                 m->mothurOut("\nClustering flowgrams...\n");
2142                 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
2143                 cluster(listFileName, distFileName, namesFileName);
2144                 
2145                 if (m->control_pressed) { break; }
2146                 
2147                 vector<int> otuData;
2148                 vector<int> cumNumSeqs;
2149                 vector<int> nSeqsPerOTU;
2150                 vector<vector<int> > aaP;       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2151                 vector<vector<int> > aaI;       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2152                 vector<int> seqNumber;          //tMaster->anP:         the sequence id number sorted by OTU
2153                 vector<int> seqIndex;           //tMaster->anI;         the index that corresponds to seqNumber
2154                 
2155                 
2156                 int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
2157                 
2158                 if (m->control_pressed) { break; }
2159                 
2160                 m->mothurRemove(distFileName);
2161                 m->mothurRemove(namesFileName);
2162                 m->mothurRemove(listFileName);
2163                 
2164                 vector<double> dist;            //adDist - distance of sequences to centroids
2165                 vector<short> change;           //did the centroid sequence change? 0 = no; 1 = yes
2166                 vector<int> centroids;          //the representative flowgram for each cluster m
2167                 vector<double> weight;
2168                 vector<double> singleTau;       //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2169                 vector<int> nSeqsBreaks;
2170                 vector<int> nOTUsBreaks;
2171                 
2172                 dist.assign(numSeqs * numOTUs, 0);
2173                 change.assign(numOTUs, 1);
2174                 centroids.assign(numOTUs, -1);
2175                 weight.assign(numOTUs, 0);
2176                 singleTau.assign(numSeqs, 1.0);
2177                 
2178                 nSeqsBreaks.assign(2, 0);
2179                 nOTUsBreaks.assign(2, 0);
2180                 
2181                 nSeqsBreaks[0] = 0;
2182                 nSeqsBreaks[1] = numSeqs;
2183                 nOTUsBreaks[1] = numOTUs;
2184                 
2185                 if (m->control_pressed) { break; }
2186                 
2187                 double maxDelta = 0;
2188                 int iter = 0;
2189                 
2190                 begClock = clock();
2191                 begTime = time(NULL);
2192                 
2193                 m->mothurOut("\nDenoising flowgrams...\n");
2194                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
2195                 
2196                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
2197                     
2198                     if (m->control_pressed) { break; }
2199                     
2200                     double cycClock = clock();
2201                     unsigned long long cycTime = time(NULL);
2202                     fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2203                     
2204                     if (m->control_pressed) { break; }
2205                     
2206                     calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2207                     
2208                     if (m->control_pressed) { break; }
2209                     
2210                     maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);  
2211                     
2212                     if (m->control_pressed) { break; }
2213                     
2214                     double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight); 
2215                     
2216                     if (m->control_pressed) { break; }
2217                     
2218                     checkCentroids(numOTUs, centroids, weight);
2219                     
2220                     if (m->control_pressed) { break; }
2221                     
2222                     calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU,  dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
2223                     
2224                     if (m->control_pressed) { break; }
2225                     
2226                     iter++;
2227                     
2228                     m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
2229                     
2230                 }       
2231                 
2232                 if (m->control_pressed) { break; }
2233                 
2234                 m->mothurOut("\nFinalizing...\n");
2235                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2236                 
2237                 if (m->control_pressed) { break; }
2238                 
2239                 setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
2240                 
2241                 if (m->control_pressed) { break; }
2242                 
2243                 vector<int> otuCounts(numOTUs, 0);
2244                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
2245                 
2246                 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber); 
2247                 
2248                 if (m->control_pressed) { break; }
2249                 
2250                 if ((large) && (g == 0)) {  flowFileName = filenames[i]; theseFlowFileNames[0] = filenames[i]; }
2251                 string thisOutputDir = outputDir;
2252                 if (outputDir == "") {  thisOutputDir = m->hasPath(flowFileName);  }
2253                 string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.qual";
2254                 string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.fasta";
2255                 string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.names";
2256                 string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.counts";
2257                 string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
2258                 string groupFileName = fileRoot + "shhh.groups";
2259
2260                 
2261                 writeQualities(numOTUs, numFlowCells, qualityFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
2262                 writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, fastaFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
2263                 writeNames(thisCompositeNamesFileName, numOTUs, nameFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU);                              if (m->control_pressed) { break; }
2264                 writeClusters(otuCountsFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI);                  if (m->control_pressed) { break; }
2265                 writeGroups(groupFileName, fileRoot, numSeqs, seqNameVector);                                           if (m->control_pressed) { break; }
2266                 
2267                 if (large) {
2268                     if (g > 0) {
2269                         m->appendFiles(qualityFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.qual"));
2270                         m->mothurRemove(qualityFileName);
2271                         m->appendFiles(fastaFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.fasta"));
2272                         m->mothurRemove(fastaFileName);
2273                         m->appendFiles(nameFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.names"));
2274                         m->mothurRemove(nameFileName);
2275                         m->appendFiles(otuCountsFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.counts"));
2276                         m->mothurRemove(otuCountsFileName);
2277                         m->appendFiles(groupFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.groups"));
2278                         m->mothurRemove(groupFileName);
2279                     }
2280                     m->mothurRemove(theseFlowFileNames[g]);
2281                 }
2282                         }
2283             
2284                         m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
2285                 }
2286                 
2287         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
2288         
2289         return 0;
2290         
2291     }catch(exception& e) {
2292             m->errorOut(e, "ShhherCommand", "driver");
2293             exit(1);
2294     }
2295 }
2296
2297 /**************************************************************************************************/
2298 int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVector, vector<int>& thisLengths, vector<short>& thisFlowDataIntI, map<string, int>& thisNameMap, int& numFlowCells){
2299         try{
2300        
2301                 ifstream flowFile;
2302        
2303                 m->openInputFile(filename, flowFile);
2304                 
2305                 string seqName;
2306                 int currentNumFlowCells;
2307                 float intensity;
2308         thisSeqNameVector.clear();
2309                 thisLengths.clear();
2310                 thisFlowDataIntI.clear();
2311                 thisNameMap.clear();
2312                 
2313                 flowFile >> numFlowCells;
2314                 int index = 0;//pcluster
2315                 while(!flowFile.eof()){
2316                         
2317                         if (m->control_pressed) { break; }
2318                         
2319                         flowFile >> seqName >> currentNumFlowCells;
2320                         thisLengths.push_back(currentNumFlowCells);
2321            
2322                         thisSeqNameVector.push_back(seqName);
2323                         thisNameMap[seqName] = index++;//pcluster
2324
2325                         for(int i=0;i<numFlowCells;i++){
2326                                 flowFile >> intensity;
2327                                 if(intensity > 9.99)    {       intensity = 9.99;       }
2328                                 int intI = int(100 * intensity + 0.0001);
2329                                 thisFlowDataIntI.push_back(intI);
2330                         }
2331                         m->gobble(flowFile);
2332                 }
2333                 flowFile.close();
2334                 
2335                 int numSeqs = thisSeqNameVector.size();         
2336                 
2337                 for(int i=0;i<numSeqs;i++){
2338                         
2339                         if (m->control_pressed) { break; }
2340                         
2341                         int iNumFlowCells = i * numFlowCells;
2342                         for(int j=thisLengths[i];j<numFlowCells;j++){
2343                                 thisFlowDataIntI[iNumFlowCells + j] = 0;
2344                         }
2345                 }
2346         
2347         return numSeqs;
2348                 
2349         }
2350         catch(exception& e) {
2351                 m->errorOut(e, "ShhherCommand", "getFlowData");
2352                 exit(1);
2353         }
2354 }
2355 /**************************************************************************************************/
2356
2357 int ShhherCommand::flowDistParentFork(int numFlowCells, string distFileName, int stopSeq, vector<int>& mapUniqueToSeq, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2358         try{            
2359         
2360                 ostringstream outStream;
2361                 outStream.setf(ios::fixed, ios::floatfield);
2362                 outStream.setf(ios::dec, ios::basefield);
2363                 outStream.setf(ios::showpoint);
2364                 outStream.precision(6);
2365                 
2366                 int begTime = time(NULL);
2367                 double begClock = clock();
2368         
2369                 for(int i=0;i<stopSeq;i++){
2370                         
2371                         if (m->control_pressed) { break; }
2372                         
2373                         for(int j=0;j<i;j++){
2374                                 float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2375                 
2376                                 if(flowDistance < 1e-6){
2377                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
2378                                 }
2379                                 else if(flowDistance <= cutoff){
2380                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
2381                                 }
2382                         }
2383                         if(i % 100 == 0){
2384                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
2385                                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2386                                 m->mothurOutEndLine();
2387                         }
2388                 }
2389                 
2390                 ofstream distFile(distFileName.c_str());
2391                 distFile << outStream.str();            
2392                 distFile.close();
2393                 
2394                 if (m->control_pressed) {}
2395                 else {
2396                         m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
2397                         m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2398                         m->mothurOutEndLine();
2399                 }
2400         
2401         return 0;
2402         }
2403         catch(exception& e) {
2404                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
2405                 exit(1);
2406         }
2407 }
2408 /**************************************************************************************************/
2409
2410 float ShhherCommand::calcPairwiseDist(int numFlowCells, int seqA, int seqB, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2411         try{
2412                 int minLength = lengths[mapSeqToUnique[seqA]];
2413                 if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
2414                 
2415                 int ANumFlowCells = seqA * numFlowCells;
2416                 int BNumFlowCells = seqB * numFlowCells;
2417                 
2418                 float dist = 0;
2419                 
2420                 for(int i=0;i<minLength;i++){
2421                         
2422                         if (m->control_pressed) { break; }
2423                         
2424                         int flowAIntI = flowDataIntI[ANumFlowCells + i];
2425                         float flowAPrI = flowDataPrI[ANumFlowCells + i];
2426                         
2427                         int flowBIntI = flowDataIntI[BNumFlowCells + i];
2428                         float flowBPrI = flowDataPrI[BNumFlowCells + i];
2429                         dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
2430                 }
2431                 
2432                 dist /= (float) minLength;
2433                 return dist;
2434         }
2435         catch(exception& e) {
2436                 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
2437                 exit(1);
2438         }
2439 }
2440
2441 /**************************************************************************************************/
2442
2443 int ShhherCommand::getUniques(int numSeqs, int numFlowCells, vector<short>& uniqueFlowgrams, vector<int>& uniqueCount, vector<int>& uniqueLengths, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2444         try{
2445                 int numUniques = 0;
2446                 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
2447                 uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
2448                 uniqueLengths.assign(numSeqs, 0);
2449                 mapSeqToUnique.assign(numSeqs, -1);
2450                 mapUniqueToSeq.assign(numSeqs, -1);
2451                 
2452                 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
2453                 
2454                 for(int i=0;i<numSeqs;i++){
2455                         
2456                         if (m->control_pressed) { break; }
2457                         
2458                         int index = 0;
2459                         
2460                         vector<short> current(numFlowCells);
2461                         for(int j=0;j<numFlowCells;j++){
2462                                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
2463                         }
2464             
2465                         for(int j=0;j<numUniques;j++){
2466                                 int offset = j * numFlowCells;
2467                                 bool toEnd = 1;
2468                                 
2469                                 int shorterLength;
2470                                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
2471                                 else                                                            {       shorterLength = uniqueLengths[j];       }
2472                 
2473                                 for(int k=0;k<shorterLength;k++){
2474                                         if(current[k] != uniqueFlowgrams[offset + k]){
2475                                                 toEnd = 0;
2476                                                 break;
2477                                         }
2478                                 }
2479                                 
2480                                 if(toEnd){
2481                                         mapSeqToUnique[i] = j;
2482                                         uniqueCount[j]++;
2483                                         index = j;
2484                                         if(lengths[i] > uniqueLengths[j])       {       uniqueLengths[j] = lengths[i];  }
2485                                         break;
2486                                 }
2487                                 index++;
2488                         }
2489                         
2490                         if(index == numUniques){
2491                                 uniqueLengths[numUniques] = lengths[i];
2492                                 uniqueCount[numUniques] = 1;
2493                                 mapSeqToUnique[i] = numUniques;//anMap
2494                                 mapUniqueToSeq[numUniques] = i;//anF
2495                                 
2496                                 for(int k=0;k<numFlowCells;k++){
2497                                         uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
2498                                         uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
2499                                 }
2500                                 
2501                                 numUniques++;
2502                         }
2503                 }
2504                 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
2505                 uniqueLengths.resize(numUniques);       
2506                 
2507                 flowDataPrI.resize(numSeqs * numFlowCells, 0);
2508                 for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
2509         
2510         return numUniques;
2511         }
2512         catch(exception& e) {
2513                 m->errorOut(e, "ShhherCommand", "getUniques");
2514                 exit(1);
2515         }
2516 }
2517 /**************************************************************************************************/
2518 int ShhherCommand::createNamesFile(int numSeqs, int numUniques, string filename, vector<string>& seqNameVector, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq){
2519         try{
2520                 
2521                 vector<string> duplicateNames(numUniques, "");
2522                 for(int i=0;i<numSeqs;i++){
2523                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
2524                 }
2525                 
2526                 ofstream nameFile;
2527                 m->openOutputFile(filename, nameFile);
2528                 
2529                 for(int i=0;i<numUniques;i++){
2530                         
2531                         if (m->control_pressed) { break; }
2532                         
2533             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2534                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2535                 }
2536                 
2537                 nameFile.close();
2538         
2539                 return 0;
2540         }
2541         catch(exception& e) {
2542                 m->errorOut(e, "ShhherCommand", "createNamesFile");
2543                 exit(1);
2544         }
2545 }
2546 //**********************************************************************************************************************
2547
2548 int ShhherCommand::cluster(string filename, string distFileName, string namesFileName){
2549         try {
2550                 
2551                 ReadMatrix* read = new ReadColumnMatrix(distFileName);  
2552                 read->setCutoff(cutoff);
2553                 
2554                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
2555                 clusterNameMap->readMap();
2556                 read->read(clusterNameMap);
2557         
2558                 ListVector* list = read->getListVector();
2559                 SparseMatrix* matrix = read->getMatrix();
2560                 
2561                 delete read; 
2562                 delete clusterNameMap; 
2563         
2564                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
2565                 
2566                 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
2567                 string tag = cluster->getTag();
2568                 
2569                 double clusterCutoff = cutoff;
2570                 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
2571                         
2572                         if (m->control_pressed) { break; }
2573                         
2574                         cluster->update(clusterCutoff);
2575                 }
2576                 
2577                 list->setLabel(toString(cutoff));
2578                 
2579                 ofstream listFile;
2580                 m->openOutputFile(filename, listFile);
2581                 list->print(listFile);
2582                 listFile.close();
2583                 
2584                 delete matrix;  delete cluster; delete rabund; delete list;
2585         
2586                 return 0;
2587         }
2588         catch(exception& e) {
2589                 m->errorOut(e, "ShhherCommand", "cluster");
2590                 exit(1);        
2591         }               
2592 }
2593 /**************************************************************************************************/
2594
2595 int ShhherCommand::getOTUData(int numSeqs, string fileName,  vector<int>& otuData,
2596                                vector<int>& cumNumSeqs,
2597                                vector<int>& nSeqsPerOTU,
2598                                vector<vector<int> >& aaP,       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2599                                vector<vector<int> >& aaI,       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2600                                vector<int>& seqNumber,          //tMaster->anP:         the sequence id number sorted by OTU
2601                                vector<int>& seqIndex,
2602                                map<string, int>& nameMap){
2603         try {
2604         
2605                 ifstream listFile;
2606                 m->openInputFile(fileName, listFile);
2607                 string label;
2608         int numOTUs;
2609                 
2610                 listFile >> label >> numOTUs;
2611         
2612                 otuData.assign(numSeqs, 0);
2613                 cumNumSeqs.assign(numOTUs, 0);
2614                 nSeqsPerOTU.assign(numOTUs, 0);
2615                 aaP.clear();aaP.resize(numOTUs);
2616                 
2617                 seqNumber.clear();
2618                 aaI.clear();
2619                 seqIndex.clear();
2620                 
2621                 string singleOTU = "";
2622                 
2623                 for(int i=0;i<numOTUs;i++){
2624                         
2625                         if (m->control_pressed) { break; }
2626             
2627                         listFile >> singleOTU;
2628                         
2629                         istringstream otuString(singleOTU);
2630             
2631                         while(otuString){
2632                                 
2633                                 string seqName = "";
2634                                 
2635                                 for(int j=0;j<singleOTU.length();j++){
2636                                         char letter = otuString.get();
2637                                         
2638                                         if(letter != ','){
2639                                                 seqName += letter;
2640                                         }
2641                                         else{
2642                                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2643                                                 int index = nmIt->second;
2644                                                 
2645                                                 nameMap.erase(nmIt);
2646                                                 
2647                                                 otuData[index] = i;
2648                                                 nSeqsPerOTU[i]++;
2649                                                 aaP[i].push_back(index);
2650                                                 seqName = "";
2651                                         }
2652                                 }
2653                                 
2654                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2655                 
2656                                 int index = nmIt->second;
2657                                 nameMap.erase(nmIt);
2658                 
2659                                 otuData[index] = i;
2660                                 nSeqsPerOTU[i]++;
2661                                 aaP[i].push_back(index);        
2662                                 
2663                                 otuString.get();
2664                         }
2665                         
2666                         sort(aaP[i].begin(), aaP[i].end());
2667                         for(int j=0;j<nSeqsPerOTU[i];j++){
2668                                 seqNumber.push_back(aaP[i][j]);
2669                         }
2670                         for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
2671                                 aaP[i].push_back(0);
2672                         }
2673                         
2674                         
2675                 }
2676                 
2677                 for(int i=1;i<numOTUs;i++){
2678                         cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
2679                 }
2680                 aaI = aaP;
2681                 seqIndex = seqNumber;
2682                 
2683                 listFile.close();       
2684         
2685         return numOTUs;
2686                 
2687         }
2688         catch(exception& e) {
2689                 m->errorOut(e, "ShhherCommand", "getOTUData");
2690                 exit(1);        
2691         }               
2692 }
2693 /**************************************************************************************************/
2694
2695 int ShhherCommand::calcCentroidsDriver(int numOTUs, 
2696                                           vector<int>& cumNumSeqs,
2697                                           vector<int>& nSeqsPerOTU,
2698                                           vector<int>& seqIndex,
2699                                           vector<short>& change,                //did the centroid sequence change? 0 = no; 1 = yes
2700                                           vector<int>& centroids,               //the representative flowgram for each cluster m
2701                                           vector<double>& singleTau,    //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2702                                           vector<int>& mapSeqToUnique,
2703                                           vector<short>& uniqueFlowgrams,
2704                                           vector<short>& flowDataIntI,
2705                                           vector<int>& lengths,
2706                                           int numFlowCells,
2707                                           vector<int>& seqNumber){                          
2708         
2709         //this function gets the most likely homopolymer length at a flow position for a group of sequences
2710         //within an otu
2711         
2712         try{
2713                 
2714                 for(int i=0;i<numOTUs;i++){
2715                         
2716                         if (m->control_pressed) { break; }
2717                         
2718                         double count = 0;
2719                         int position = 0;
2720                         int minFlowGram = 100000000;
2721                         double minFlowValue = 1e8;
2722                         change[i] = 0; //FALSE
2723                         
2724                         for(int j=0;j<nSeqsPerOTU[i];j++){
2725                                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
2726                         }
2727             
2728                         if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
2729                                 vector<double> adF(nSeqsPerOTU[i]);
2730                                 vector<int> anL(nSeqsPerOTU[i]);
2731                                 
2732                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2733                                         int index = cumNumSeqs[i] + j;
2734                                         int nI = seqIndex[index];
2735                                         int nIU = mapSeqToUnique[nI];
2736                                         
2737                                         int k;
2738                                         for(k=0;k<position;k++){
2739                                                 if(nIU == anL[k]){
2740                                                         break;
2741                                                 }
2742                                         }
2743                                         if(k == position){
2744                                                 anL[position] = nIU;
2745                                                 adF[position] = 0.0000;
2746                                                 position++;
2747                                         }                                               
2748                                 }
2749                                 
2750                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2751                                         int index = cumNumSeqs[i] + j;
2752                                         int nI = seqIndex[index];
2753                                         
2754                                         double tauValue = singleTau[seqNumber[index]];
2755                                         
2756                                         for(int k=0;k<position;k++){
2757                                                 double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
2758                                                 adF[k] += dist * tauValue;
2759                                         }
2760                                 }
2761                                 
2762                                 for(int j=0;j<position;j++){
2763                                         if(adF[j] < minFlowValue){
2764                                                 minFlowGram = j;
2765                                                 minFlowValue = adF[j];
2766                                         }
2767                                 }
2768                                 
2769                                 if(centroids[i] != anL[minFlowGram]){
2770                                         change[i] = 1;
2771                                         centroids[i] = anL[minFlowGram];
2772                                 }
2773                         }
2774                         else if(centroids[i] != -1){
2775                                 change[i] = 1;
2776                                 centroids[i] = -1;                      
2777                         }
2778                 }
2779         
2780         return 0;
2781         }
2782         catch(exception& e) {
2783                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
2784                 exit(1);        
2785         }               
2786 }
2787 /**************************************************************************************************/
2788
2789 double ShhherCommand::getDistToCentroid(int cent, int flow, int length, vector<short>& uniqueFlowgrams,
2790                                         vector<short>& flowDataIntI, int numFlowCells){
2791         try{
2792                 
2793                 int flowAValue = cent * numFlowCells;
2794                 int flowBValue = flow * numFlowCells;
2795                 
2796                 double dist = 0;
2797         
2798                 for(int i=0;i<length;i++){
2799                         dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
2800                         flowAValue++;
2801                         flowBValue++;
2802                 }
2803                 
2804                 return dist / (double)length;
2805         }
2806         catch(exception& e) {
2807                 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
2808                 exit(1);        
2809         }               
2810 }
2811 /**************************************************************************************************/
2812
2813 double ShhherCommand::getNewWeights(int numOTUs, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<double>& singleTau, vector<int>& seqNumber, vector<double>& weight){
2814         try{
2815                 
2816                 double maxChange = 0;
2817                 
2818                 for(int i=0;i<numOTUs;i++){
2819                         
2820                         if (m->control_pressed) { break; }
2821                         
2822                         double difference = weight[i];
2823                         weight[i] = 0;
2824                         
2825                         for(int j=0;j<nSeqsPerOTU[i];j++){
2826                                 int index = cumNumSeqs[i] + j;
2827                                 double tauValue = singleTau[seqNumber[index]];
2828                                 weight[i] += tauValue;
2829                         }
2830                         
2831                         difference = fabs(weight[i] - difference);
2832                         if(difference > maxChange){     maxChange = difference; }
2833                 }
2834                 return maxChange;
2835         }
2836         catch(exception& e) {
2837                 m->errorOut(e, "ShhherCommand", "getNewWeights");
2838                 exit(1);        
2839         }               
2840 }
2841
2842 /**************************************************************************************************/
2843
2844 double ShhherCommand::getLikelihood(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<int>& seqNumber, vector<int>& cumNumSeqs, vector<int>& seqIndex, vector<double>& dist, vector<double>& weight){
2845         
2846         try{
2847                 
2848                 vector<long double> P(numSeqs, 0);
2849                 int effNumOTUs = 0;
2850                 
2851                 for(int i=0;i<numOTUs;i++){
2852                         if(weight[i] > MIN_WEIGHT){
2853                                 effNumOTUs++;
2854                         }
2855                 }
2856                 
2857                 string hold;
2858                 for(int i=0;i<numOTUs;i++){
2859                         
2860                         if (m->control_pressed) { break; }
2861                         
2862                         for(int j=0;j<nSeqsPerOTU[i];j++){
2863                                 int index = cumNumSeqs[i] + j;
2864                                 int nI = seqIndex[index];
2865                                 double singleDist = dist[seqNumber[index]];
2866                                 
2867                                 P[nI] += weight[i] * exp(-singleDist * sigma);
2868                         }
2869                 }
2870                 double nLL = 0.00;
2871                 for(int i=0;i<numSeqs;i++){
2872                         if(P[i] == 0){  P[i] = DBL_EPSILON;     }
2873             
2874                         nLL += -log(P[i]);
2875                 }
2876                 
2877                 nLL = nLL -(double)numSeqs * log(sigma);
2878         
2879                 return nLL; 
2880         }
2881         catch(exception& e) {
2882                 m->errorOut(e, "ShhherCommand", "getNewWeights");
2883                 exit(1);        
2884         }               
2885 }
2886
2887 /**************************************************************************************************/
2888
2889 int ShhherCommand::checkCentroids(int numOTUs, vector<int>& centroids, vector<double>& weight){
2890         try{
2891                 vector<int> unique(numOTUs, 1);
2892                 
2893                 for(int i=0;i<numOTUs;i++){
2894                         if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
2895                                 unique[i] = -1;
2896                         }
2897                 }
2898                 
2899                 for(int i=0;i<numOTUs;i++){
2900                         
2901                         if (m->control_pressed) { break; }
2902                         
2903                         if(unique[i] == 1){
2904                                 for(int j=i+1;j<numOTUs;j++){
2905                                         if(unique[j] == 1){
2906                                                 
2907                                                 if(centroids[j] == centroids[i]){
2908                                                         unique[j] = 0;
2909                                                         centroids[j] = -1;
2910                                                         
2911                                                         weight[i] += weight[j];
2912                                                         weight[j] = 0.0;
2913                                                 }
2914                                         }
2915                                 }
2916                         }
2917                 }
2918         
2919         return 0;
2920         }
2921         catch(exception& e) {
2922                 m->errorOut(e, "ShhherCommand", "checkCentroids");
2923                 exit(1);        
2924         }               
2925 }
2926 /**************************************************************************************************/
2927
2928 void ShhherCommand::calcNewDistances(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<double>& dist, 
2929                                      vector<double>& weight, vector<short>& change, vector<int>& centroids,
2930                                      vector<vector<int> >& aaP, vector<double>& singleTau, vector<vector<int> >& aaI,   
2931                                      vector<int>& seqNumber, vector<int>& seqIndex,
2932                                      vector<short>& uniqueFlowgrams,
2933                                      vector<short>& flowDataIntI, int numFlowCells, vector<int>& lengths){
2934         
2935         try{
2936                 
2937                 int total = 0;
2938                 vector<double> newTau(numOTUs,0);
2939                 vector<double> norms(numSeqs, 0);
2940                 nSeqsPerOTU.assign(numOTUs, 0);
2941         
2942                 for(int i=0;i<numSeqs;i++){
2943                         
2944                         if (m->control_pressed) { break; }
2945                         
2946                         int indexOffset = i * numOTUs;
2947             
2948                         double offset = 1e8;
2949                         
2950                         for(int j=0;j<numOTUs;j++){
2951                 
2952                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
2953                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
2954                                 }
2955                 
2956                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
2957                                         offset = dist[indexOffset + j];
2958                                 }
2959                         }
2960             
2961                         for(int j=0;j<numOTUs;j++){
2962                                 if(weight[j] > MIN_WEIGHT){
2963                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
2964                                         norms[i] += newTau[j];
2965                                 }
2966                                 else{
2967                                         newTau[j] = 0.0;
2968                                 }
2969                         }
2970             
2971                         for(int j=0;j<numOTUs;j++){
2972                                 newTau[j] /= norms[i];
2973                         }
2974             
2975                         for(int j=0;j<numOTUs;j++){
2976                                 if(newTau[j] > MIN_TAU){
2977                                         
2978                                         int oldTotal = total;
2979                                         
2980                                         total++;
2981                                         
2982                                         singleTau.resize(total, 0);
2983                                         seqNumber.resize(total, 0);
2984                                         seqIndex.resize(total, 0);
2985                                         
2986                                         singleTau[oldTotal] = newTau[j];
2987                                         
2988                                         aaP[j][nSeqsPerOTU[j]] = oldTotal;
2989                                         aaI[j][nSeqsPerOTU[j]] = i;
2990                                         nSeqsPerOTU[j]++;
2991                                 }
2992                         }
2993             
2994                 }
2995         
2996         }
2997         catch(exception& e) {
2998                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
2999                 exit(1);        
3000         }               
3001 }
3002 /**************************************************************************************************/
3003
3004 int ShhherCommand::fill(int numOTUs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
3005         try {
3006                 int index = 0;
3007                 for(int i=0;i<numOTUs;i++){
3008                         
3009                         if (m->control_pressed) { return 0; }
3010                         
3011                         cumNumSeqs[i] = index;
3012                         for(int j=0;j<nSeqsPerOTU[i];j++){
3013                                 seqNumber[index] = aaP[i][j];
3014                                 seqIndex[index] = aaI[i][j];
3015                                 
3016                                 index++;
3017                         }
3018                 }
3019         
3020         return 0; 
3021         }
3022         catch(exception& e) {
3023                 m->errorOut(e, "ShhherCommand", "fill");
3024                 exit(1);        
3025         }               
3026 }
3027 /**************************************************************************************************/
3028
3029 void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU,
3030                             vector<int>& otuData, vector<double>& singleTau, vector<double>& dist, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
3031         
3032         try {
3033                 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
3034                 
3035                 for(int i=0;i<numOTUs;i++){
3036                         
3037                         if (m->control_pressed) { break; }
3038                         
3039                         for(int j=0;j<nSeqsPerOTU[i];j++){
3040                                 int index = cumNumSeqs[i] + j;
3041                                 double tauValue = singleTau[seqNumber[index]];
3042                                 int sIndex = seqIndex[index];
3043                                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
3044                         }
3045                 }
3046                 
3047                 for(int i=0;i<numSeqs;i++){
3048                         double maxTau = -1.0000;
3049                         int maxOTU = -1;
3050                         for(int j=0;j<numOTUs;j++){
3051                                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
3052                                         maxTau = bigTauMatrix[i * numOTUs + j];
3053                                         maxOTU = j;
3054                                 }
3055                         }
3056                         
3057                         otuData[i] = maxOTU;
3058                 }
3059                 
3060                 nSeqsPerOTU.assign(numOTUs, 0);         
3061                 
3062                 for(int i=0;i<numSeqs;i++){
3063                         int index = otuData[i];
3064                         
3065                         singleTau[i] = 1.0000;
3066                         dist[i] = 0.0000;
3067                         
3068                         aaP[index][nSeqsPerOTU[index]] = i;
3069                         aaI[index][nSeqsPerOTU[index]] = i;
3070                         
3071                         nSeqsPerOTU[index]++;
3072                 }
3073         
3074                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);  
3075         }
3076         catch(exception& e) {
3077                 m->errorOut(e, "ShhherCommand", "setOTUs");
3078                 exit(1);        
3079         }               
3080 }
3081 /**************************************************************************************************/
3082
3083 void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string qualityFileName, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
3084                                    vector<double>& singleTau, vector<short>& flowDataIntI, vector<short>& uniqueFlowgrams, vector<int>& cumNumSeqs,
3085                                    vector<int>& mapUniqueToSeq, vector<string>& seqNameVector, vector<int>& centroids, vector<vector<int> >& aaI){
3086         
3087         try {
3088         
3089                 ofstream qualityFile;
3090                 m->openOutputFile(qualityFileName, qualityFile);
3091         
3092                 qualityFile.setf(ios::fixed, ios::floatfield);
3093                 qualityFile.setf(ios::showpoint);
3094                 qualityFile << setprecision(6);
3095                 
3096                 vector<vector<int> > qualities(numOTUs);
3097                 vector<double> pr(HOMOPS, 0);
3098                 
3099                 
3100                 for(int i=0;i<numOTUs;i++){
3101                         
3102                         if (m->control_pressed) { break; }
3103                         
3104                         int index = 0;
3105                         int base = 0;
3106                         
3107                         if(nSeqsPerOTU[i] > 0){
3108                                 qualities[i].assign(1024, -1);
3109                                 
3110                                 while(index < numFlowCells){
3111                                         double maxPrValue = 1e8;
3112                                         short maxPrIndex = -1;
3113                                         double count = 0.0000;
3114                                         
3115                                         pr.assign(HOMOPS, 0);
3116                                         
3117                                         for(int j=0;j<nSeqsPerOTU[i];j++){
3118                                                 int lIndex = cumNumSeqs[i] + j;
3119                                                 double tauValue = singleTau[seqNumber[lIndex]];
3120                                                 int sequenceIndex = aaI[i][j];
3121                                                 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
3122                                                 
3123                                                 count += tauValue;
3124                                                 
3125                                                 for(int s=0;s<HOMOPS;s++){
3126                                                         pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
3127                                                 }
3128                                         }
3129                                         
3130                                         maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
3131                                         maxPrValue = pr[maxPrIndex];
3132                                         
3133                                         if(count > MIN_COUNT){
3134                                                 double U = 0.0000;
3135                                                 double norm = 0.0000;
3136                                                 
3137                                                 for(int s=0;s<HOMOPS;s++){
3138                                                         norm += exp(-(pr[s] - maxPrValue));
3139                                                 }
3140                                                 
3141                                                 for(int s=1;s<=maxPrIndex;s++){
3142                                                         int value = 0;
3143                                                         double temp = 0.0000;
3144                                                         
3145                                                         U += exp(-(pr[s-1]-maxPrValue))/norm;
3146                                                         
3147                                                         if(U>0.00){
3148                                                                 temp = log10(U);
3149                                                         }
3150                                                         else{
3151                                                                 temp = -10.1;
3152                                                         }
3153                                                         temp = floor(-10 * temp);
3154                                                         value = (int)floor(temp);
3155                                                         if(value > 100){        value = 100;    }
3156                                                         
3157                                                         qualities[i][base] = (int)value;
3158                                                         base++;
3159                                                 }
3160                                         }
3161                                         
3162                                         index++;
3163                                 }
3164                         }
3165                         
3166                         
3167                         if(otuCounts[i] > 0){
3168                                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
3169                         
3170                                 int j=4;        //need to get past the first four bases
3171                                 while(qualities[i][j] != -1){
3172                     qualityFile << qualities[i][j] << ' ';
3173                     if (j > qualities[i].size()) { break; }
3174                     j++;
3175                                 }
3176                                 qualityFile << endl;
3177                         }
3178                 }
3179                 qualityFile.close();
3180                 outputNames.push_back(qualityFileName);
3181         
3182         }
3183         catch(exception& e) {
3184                 m->errorOut(e, "ShhherCommand", "writeQualities");
3185                 exit(1);        
3186         }               
3187 }
3188
3189 /**************************************************************************************************/
3190
3191 void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string fastaFileName, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
3192         try {
3193                 
3194                 ofstream fastaFile;
3195                 m->openOutputFile(fastaFileName, fastaFile);
3196                 
3197                 vector<string> names(numOTUs, "");
3198                 
3199                 for(int i=0;i<numOTUs;i++){
3200                         
3201                         if (m->control_pressed) { break; }
3202                         
3203                         int index = centroids[i];
3204                         
3205                         if(otuCounts[i] > 0){
3206                                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
3207                                 
3208                                 string newSeq = "";
3209                                 
3210                                 for(int j=0;j<numFlowCells;j++){
3211                                         
3212                                         char base = flowOrder[j % 4];
3213                                         for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
3214                                                 newSeq += base;
3215                                         }
3216                                 }
3217                                 
3218                                 if (newSeq.length() >= 4) {  fastaFile << newSeq.substr(4) << endl;  }
3219                 else {  fastaFile << "NNNN" << endl;  }
3220                         }
3221                 }
3222                 fastaFile.close();
3223         
3224                 outputNames.push_back(fastaFileName);
3225         
3226                 if(thisCompositeFASTAFileName != ""){
3227                         m->appendFiles(fastaFileName, thisCompositeFASTAFileName);
3228                 }
3229         }
3230         catch(exception& e) {
3231                 m->errorOut(e, "ShhherCommand", "writeSequences");
3232                 exit(1);        
3233         }               
3234 }
3235
3236 /**************************************************************************************************/
3237
3238 void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string nameFileName, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
3239         try {
3240                 
3241                 ofstream nameFile;
3242                 m->openOutputFile(nameFileName, nameFile);
3243                 
3244                 for(int i=0;i<numOTUs;i++){
3245                         
3246                         if (m->control_pressed) { break; }
3247                         
3248                         if(otuCounts[i] > 0){
3249                                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
3250                                 
3251                                 for(int j=1;j<nSeqsPerOTU[i];j++){
3252                                         nameFile << ',' << seqNameVector[aaI[i][j]];
3253                                 }
3254                                 
3255                                 nameFile << endl;
3256                         }
3257                 }
3258                 nameFile.close();
3259                 outputNames.push_back(nameFileName);
3260                 
3261                 
3262                 if(thisCompositeNamesFileName != ""){
3263                         m->appendFiles(nameFileName, thisCompositeNamesFileName);
3264                 }               
3265         }
3266         catch(exception& e) {
3267                 m->errorOut(e, "ShhherCommand", "writeNames");
3268                 exit(1);        
3269         }               
3270 }
3271
3272 /**************************************************************************************************/
3273
3274 void ShhherCommand::writeGroups(string groupFileName, string fileRoot, int numSeqs, vector<string>& seqNameVector){
3275         try {
3276         ofstream groupFile;
3277                 m->openOutputFile(groupFileName, groupFile);
3278                 
3279                 for(int i=0;i<numSeqs;i++){
3280                         if (m->control_pressed) { break; }
3281                         groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
3282                 }
3283                 groupFile.close();
3284                 outputNames.push_back(groupFileName);
3285         
3286         }
3287         catch(exception& e) {
3288                 m->errorOut(e, "ShhherCommand", "writeGroups");
3289                 exit(1);        
3290         }               
3291 }
3292
3293 /**************************************************************************************************/
3294
3295 void ShhherCommand::writeClusters(string otuCountsFileName, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
3296         try {
3297                 ofstream otuCountsFile;
3298                 m->openOutputFile(otuCountsFileName, otuCountsFile);
3299                 
3300                 string bases = flowOrder;
3301                 
3302                 for(int i=0;i<numOTUs;i++){
3303                         
3304                         if (m->control_pressed) {
3305                                 break;
3306                         }
3307                         //output the translated version of the centroid sequence for the otu
3308                         if(otuCounts[i] > 0){
3309                                 int index = centroids[i];
3310                                 
3311                                 otuCountsFile << "ideal\t";
3312                                 for(int j=8;j<numFlowCells;j++){
3313                                         char base = bases[j % 4];
3314                                         for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
3315                                                 otuCountsFile << base;
3316                                         }
3317                                 }
3318                                 otuCountsFile << endl;
3319                                 
3320                                 for(int j=0;j<nSeqsPerOTU[i];j++){
3321                                         int sequence = aaI[i][j];
3322                                         otuCountsFile << seqNameVector[sequence] << '\t';
3323                                         
3324                                         string newSeq = "";
3325                                         
3326                                         for(int k=0;k<lengths[sequence];k++){
3327                                                 char base = bases[k % 4];
3328                                                 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
3329                         
3330                                                 for(int s=0;s<freq;s++){
3331                                                         newSeq += base;
3332                                                         //otuCountsFile << base;
3333                                                 }
3334                                         }
3335                                         
3336                     if (newSeq.length() >= 4) {  otuCountsFile << newSeq.substr(4) << endl;  }
3337                     else {  otuCountsFile << "NNNN" << endl;  }
3338                                 }
3339                                 otuCountsFile << endl;
3340                         }
3341                 }
3342                 otuCountsFile.close();
3343                 outputNames.push_back(otuCountsFileName);
3344         
3345         }
3346         catch(exception& e) {
3347                 m->errorOut(e, "ShhherCommand", "writeClusters");
3348                 exit(1);        
3349         }               
3350 }
3351
3352 /**************************************************************************************************/
3353
3354 void ShhherCommand::getSingleLookUp(){
3355         try{
3356                 //      these are the -log probabilities that a signal corresponds to a particular homopolymer length
3357                 singleLookUp.assign(HOMOPS * NUMBINS, 0);
3358                 
3359                 int index = 0;
3360                 ifstream lookUpFile;
3361                 m->openInputFile(lookupFileName, lookUpFile);
3362                 
3363                 for(int i=0;i<HOMOPS;i++){
3364                         
3365                         if (m->control_pressed) { break; }
3366                         
3367                         float logFracFreq;
3368                         lookUpFile >> logFracFreq;
3369                         
3370                         for(int j=0;j<NUMBINS;j++)      {
3371                                 lookUpFile >> singleLookUp[index];
3372                                 index++;
3373                         }
3374                 }       
3375                 lookUpFile.close();
3376         }
3377         catch(exception& e) {
3378                 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
3379                 exit(1);
3380         }
3381 }
3382
3383 /**************************************************************************************************/
3384
3385 void ShhherCommand::getJointLookUp(){
3386         try{
3387                 
3388                 //      the most likely joint probability (-log) that two intenities have the same polymer length
3389                 jointLookUp.resize(NUMBINS * NUMBINS, 0);
3390                 
3391                 for(int i=0;i<NUMBINS;i++){
3392                         
3393                         if (m->control_pressed) { break; }
3394                         
3395                         for(int j=0;j<NUMBINS;j++){             
3396                                 
3397                                 double minSum = 100000000;
3398                                 
3399                                 for(int k=0;k<HOMOPS;k++){
3400                                         double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
3401                                         
3402                                         if(sum < minSum)        {       minSum = sum;           }
3403                                 }       
3404                                 jointLookUp[i * NUMBINS + j] = minSum;
3405                         }
3406                 }
3407         }
3408         catch(exception& e) {
3409                 m->errorOut(e, "ShhherCommand", "getJointLookUp");
3410                 exit(1);
3411         }
3412 }
3413
3414 /**************************************************************************************************/
3415
3416 double ShhherCommand::getProbIntensity(int intIntensity){                          
3417         try{
3418
3419                 double minNegLogProb = 100000000; 
3420
3421                 
3422                 for(int i=0;i<HOMOPS;i++){//loop signal strength
3423                         
3424                         if (m->control_pressed) { break; }
3425                         
3426                         float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
3427                         if(negLogProb < minNegLogProb)  {       minNegLogProb = negLogProb; }
3428                 }
3429                 
3430                 return minNegLogProb;
3431         }
3432         catch(exception& e) {
3433                 m->errorOut(e, "ShhherCommand", "getProbIntensity");
3434                 exit(1);
3435         }
3436 }
3437
3438
3439
3440