]> git.donarmstrong.com Git - mothur.git/blob - shhhercommand.cpp
added design and sets parameters to the rarefaction.shared command. finished work...
[mothur.git] / shhhercommand.cpp
1 /*
2  *  shhher.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/27/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "shhhercommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> ShhherCommand::setParameters(){  
14         try {
15                 CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pflow);
16                 CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pfile);
17                 CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(plookup);
18                 CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(pcutoff);
19                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
20                 CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "",false,false); parameters.push_back(pmaxiter);
21         CommandParameter plarge("large", "Number", "", "-1", "", "", "",false,false); parameters.push_back(plarge);
22                 CommandParameter psigma("sigma", "Number", "", "60", "", "", "",false,false); parameters.push_back(psigma);
23                 CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "",false,false); parameters.push_back(pmindelta);
24                 CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder);
25                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
26                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
27                 
28                 vector<string> myArray;
29                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
30                 return myArray;
31         }
32         catch(exception& e) {
33                 m->errorOut(e, "ShhherCommand", "setParameters");
34                 exit(1);
35         }
36 }
37 //**********************************************************************************************************************
38 string ShhherCommand::getHelpString(){  
39         try {
40                 string helpString = "";
41                 helpString += "The shhh.flows command reads a file containing flowgrams and creates a file of corrected sequences.\n";
42                 return helpString;
43         }
44         catch(exception& e) {
45                 m->errorOut(e, "ShhherCommand", "getHelpString");
46                 exit(1);
47         }
48 }
49 //**********************************************************************************************************************
50
51 ShhherCommand::ShhherCommand(){ 
52         try {
53                 abort = true; calledHelp = true;
54                 setParameters();
55                 
56                 //initialize outputTypes
57 //              vector<string> tempOutNames;
58 //              outputTypes["pn.dist"] = tempOutNames;
59
60         }
61         catch(exception& e) {
62                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
63                 exit(1);
64         }
65 }
66
67 //**********************************************************************************************************************
68
69 ShhherCommand::ShhherCommand(string option) {
70         try {
71         
72 #ifdef USE_MPI
73                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
74                 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
75         
76                 if(pid == 0){
77 #endif
78                 abort = false; calledHelp = false;   
79                 
80                 //allow user to run help
81                 if(option == "help") { help(); abort = true; calledHelp = true; }
82                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
83                 
84                 else {
85                         vector<string> myArray = setParameters();
86                         
87                         OptionParser parser(option);
88                         map<string,string> parameters = parser.getParameters();
89                         
90                         ValidParameters validParameter;
91                         map<string,string>::iterator it;
92                         
93                         //check to make sure all parameters are valid for command
94                         for (it = parameters.begin(); it != parameters.end(); it++) { 
95                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
96                         }
97                         
98                         //initialize outputTypes
99                         vector<string> tempOutNames;
100 //                      outputTypes["pn.dist"] = tempOutNames;
101                         //                      outputTypes["fasta"] = tempOutNames;
102                         
103                         //if the user changes the input directory command factory will send this info to us in the output parameter 
104                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
105                         if (inputDir == "not found"){   inputDir = "";          }
106                         else {
107                                 string path;
108                                 it = parameters.find("flow");
109                                 //user has given a template file
110                                 if(it != parameters.end()){ 
111                                         path = m->hasPath(it->second);
112                                         //if the user has not given a path then, add inputdir. else leave path alone.
113                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
114                                 }
115                                 
116                                 it = parameters.find("lookup");
117                                 //user has given a template file
118                                 if(it != parameters.end()){ 
119                                         path = m->hasPath(it->second);
120                                         //if the user has not given a path then, add inputdir. else leave path alone.
121                                         if (path == "") {       parameters["lookup"] = inputDir + it->second;           }
122                                 }
123
124                                 it = parameters.find("file");
125                                 //user has given a template file
126                                 if(it != parameters.end()){ 
127                                         path = m->hasPath(it->second);
128                                         //if the user has not given a path then, add inputdir. else leave path alone.
129                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
130                                 }
131                         }
132             
133             //if the user changes the output directory command factory will send this info to us in the output parameter 
134                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
135             
136                         //check for required parameters
137                         flowFileName = validParameter.validFile(parameters, "flow", true);
138                         flowFilesFileName = validParameter.validFile(parameters, "file", true);
139                         if (flowFileName == "not found" && flowFilesFileName == "not found") {
140                                 m->mothurOut("values for either flow or file must be provided for the shhh.flows command.");
141                                 m->mothurOutEndLine();
142                                 abort = true; 
143                         }
144                         else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
145                         
146                         if(flowFileName != "not found"){
147                                 compositeFASTAFileName = "";    
148                                 compositeNamesFileName = "";    
149                         }
150                         else{
151                                 ofstream temp;
152                 
153                 string thisoutputDir = m->hasPath(flowFilesFileName); //if user entered a file with a path then preserve it
154                 
155                                 //flow.files = 9 character offset
156                                 compositeFASTAFileName = thisoutputDir + m->getRootName(m->getSimpleName(flowFilesFileName)) + "shhh.fasta";
157                                 m->openOutputFile(compositeFASTAFileName, temp);
158                                 temp.close();
159                                 
160                                 compositeNamesFileName = thisoutputDir + m->getRootName(m->getSimpleName(flowFilesFileName)) + "shhh.names";
161                                 m->openOutputFile(compositeNamesFileName, temp);
162                                 temp.close();
163                         }
164             
165             if(flowFilesFileName != "not found"){
166                 string fName;
167                 
168                 ifstream flowFilesFile;
169                 m->openInputFile(flowFilesFileName, flowFilesFile);
170                 while(flowFilesFile){
171                     fName = m->getline(flowFilesFile);
172                     
173                     //test if file is valid
174                     ifstream in;
175                     int ableToOpen = m->openInputFile(fName, in, "noerror");
176                     in.close(); 
177                     if (ableToOpen == 1) {
178                         if (inputDir != "") { //default path is set
179                             string tryPath = inputDir + fName;
180                             m->mothurOut("Unable to open " + fName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
181                             ifstream in2;
182                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
183                             in2.close();
184                             fName = tryPath;
185                         }
186                     }
187                     
188                     if (ableToOpen == 1) {
189                         if (m->getDefaultPath() != "") { //default path is set
190                             string tryPath = m->getDefaultPath() + m->getSimpleName(fName);
191                             m->mothurOut("Unable to open " + fName + ". Trying default " + tryPath); m->mothurOutEndLine();
192                             ifstream in2;
193                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
194                             in2.close();
195                             fName = tryPath;
196                         }
197                     }
198                     
199                     //if you can't open it its not in current working directory or inputDir, try mothur excutable location
200                     if (ableToOpen == 1) {
201                         string exepath = m->argv;
202                         string tempPath = exepath;
203                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
204                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
205                         
206                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(fName);
207                         m->mothurOut("Unable to open " + fName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
208                         ifstream in2;
209                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
210                         in2.close();
211                         fName = tryPath;
212                     }
213                     
214                     if (ableToOpen == 1) {  m->mothurOut("Unable to open " + fName + ". Disregarding. "); m->mothurOutEndLine();  }
215                     else { flowFileVector.push_back(fName); }
216                     m->gobble(flowFilesFile);
217                 }
218                 flowFilesFile.close();
219                 if (flowFileVector.size() == 0) {  m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
220             }
221             else{
222                 if (outputDir == "") { outputDir = m->hasPath(flowFileName); }
223                 flowFileVector.push_back(flowFileName);
224             }
225                 
226                         //check for optional parameter and set defaults
227                         // ...at some point should added some additional type checking...
228                         string temp;
229                         temp = validParameter.validFile(parameters, "lookup", true);
230                         if (temp == "not found")        {       
231                                 lookupFileName = "LookUp_Titanium.pat"; 
232                                 
233                                 int ableToOpen;
234                                 ifstream in;
235                                 ableToOpen = m->openInputFile(lookupFileName, in, "noerror");
236                                 in.close();     
237                                 
238                                 //if you can't open it, try input location
239                                 if (ableToOpen == 1) {
240                                         if (inputDir != "") { //default path is set
241                                                 string tryPath = inputDir + lookupFileName;
242                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
243                                                 ifstream in2;
244                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
245                                                 in2.close();
246                                                 lookupFileName = tryPath;
247                                         }
248                                 }
249                                 
250                                 //if you can't open it, try default location
251                                 if (ableToOpen == 1) {
252                                         if (m->getDefaultPath() != "") { //default path is set
253                                                 string tryPath = m->getDefaultPath() + m->getSimpleName(lookupFileName);
254                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying default " + tryPath); m->mothurOutEndLine();
255                                                 ifstream in2;
256                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
257                                                 in2.close();
258                                                 lookupFileName = tryPath;
259                                         }
260                                 }
261                                 
262                                 //if you can't open it its not in current working directory or inputDir, try mothur excutable location
263                                 if (ableToOpen == 1) {
264                                         string exepath = m->argv;
265                                         string tempPath = exepath;
266                                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
267                                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
268                                         
269                                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
270                                         m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
271                                         ifstream in2;
272                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
273                                         in2.close();
274                                         lookupFileName = tryPath;
275                                 }
276                                 
277                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
278                         }
279                         else if(temp == "not open")     {       
280                                 
281                                 lookupFileName = validParameter.validFile(parameters, "lookup", false);
282                                 
283                                 //if you can't open it its not inputDir, try mothur excutable location
284                                 string exepath = m->argv;
285                                 string tempPath = exepath;
286                                 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
287                                 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
288                                         
289                                 string tryPath = m->getFullPathName(exepath) + lookupFileName;
290                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
291                                 ifstream in2;
292                                 int ableToOpen = m->openInputFile(tryPath, in2, "noerror");
293                                 in2.close();
294                                 lookupFileName = tryPath;
295                                 
296                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
297                         }else                                           {       lookupFileName = temp;  }
298                         
299                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
300                         m->setProcessors(temp);
301                         m->mothurConvert(temp, processors);
302
303                         temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.01";          }
304                         m->mothurConvert(temp, cutoff); 
305                         
306                         temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){       temp = "0.000001";      }
307                         m->mothurConvert(temp, minDelta); 
308
309                         temp = validParameter.validFile(parameters, "maxiter", false);  if (temp == "not found"){       temp = "1000";          }
310                         m->mothurConvert(temp, maxIters); 
311             
312             temp = validParameter.validFile(parameters, "large", false);        if (temp == "not found"){       temp = "0";             }
313                         m->mothurConvert(temp, largeSize); 
314             if (largeSize != 0) { large = true; }
315             else { large = false;  }
316             if (largeSize < 0) {  m->mothurOut("The value of the large cannot be negative.\n"); }
317             
318 #ifdef USE_MPI
319             if (large) { m->mothurOut("The large parameter is not available with the MPI-Enabled version.\n"); large=false; }                           
320 #endif
321             
322
323                         temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
324                         m->mothurConvert(temp, sigma); 
325                         
326                         flowOrder = validParameter.validFile(parameters, "order", false);
327                         if (flowOrder == "not found"){ flowOrder = "TACG";              }
328                         else if(flowOrder.length() != 4){
329                                 m->mothurOut("The value of the order option must be four bases long\n");
330                         }
331                         
332                 }
333 #ifdef USE_MPI
334                 }                               
335 #endif
336         }
337         catch(exception& e) {
338                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
339                 exit(1);
340         }
341 }
342 //**********************************************************************************************************************
343 #ifdef USE_MPI
344 int ShhherCommand::execute(){
345         try {
346                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
347                 
348                 int tag = 1976;
349                 MPI_Status status; 
350                         
351                 if(pid == 0){
352
353                         for(int i=1;i<ncpus;i++){
354                                 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
355                         }
356                         if(abort == 1){ return 0;       }
357
358                         processors = ncpus;
359                         
360                         m->mothurOut("\nGetting preliminary data...\n");
361                         getSingleLookUp();      if (m->control_pressed) { return 0; }
362                         getJointLookUp();       if (m->control_pressed) { return 0; }
363                         
364             vector<string> flowFileVector;
365                         if(flowFilesFileName != "not found"){
366                                 string fName;
367                 
368                                 ifstream flowFilesFile;
369                                 m->openInputFile(flowFilesFileName, flowFilesFile);
370                                 while(flowFilesFile){
371                                         fName = m->getline(flowFilesFile);
372                                         flowFileVector.push_back(fName);
373                                         m->gobble(flowFilesFile);
374                                 }
375                         }
376                         else{
377                                 flowFileVector.push_back(flowFileName);
378                         }
379             
380                         int numFiles = flowFileVector.size();
381
382                         for(int i=1;i<ncpus;i++){
383                                 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
384                         }
385                         
386                         for(int i=0;i<numFiles;i++){
387                                 
388                                 if (m->control_pressed) { break; }
389                                 
390                                 double begClock = clock();
391                                 unsigned long long begTime = time(NULL);
392
393                                 flowFileName = flowFileVector[i];
394                                 
395                                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
396                                 m->mothurOut("Reading flowgrams...\n");
397                                 getFlowData();
398                                 
399                                 if (m->control_pressed) { break; }
400
401                                 m->mothurOut("Identifying unique flowgrams...\n");
402                                 getUniques();
403                                 
404                                 if (m->control_pressed) { break; }
405
406                                 m->mothurOut("Calculating distances between flowgrams...\n");
407                                 char fileName[1024];
408                                 strcpy(fileName, flowFileName.c_str());
409
410                                 for(int i=1;i<ncpus;i++){
411                                         MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
412
413                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
414                                         MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
415                                         MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
416                                         MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
417                                         MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
418                                         MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
419                                         MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
420                                         MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
421                                         MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
422                                         MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
423                                 }                               
424                                                         
425                                 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
426                                 
427                                 if (m->control_pressed) { break; }
428                                 
429                                 int done;
430                                 for(int i=1;i<ncpus;i++){
431                                         MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
432                                         
433                                         m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
434                                         m->mothurRemove((distFileName + ".temp." + toString(i)));
435                                 }
436
437                                 string namesFileName = createNamesFile();
438                                 
439                                 if (m->control_pressed) { break; }
440                                 
441                                 m->mothurOut("\nClustering flowgrams...\n");
442                                 string listFileName = cluster(distFileName, namesFileName);
443                                 
444                                 if (m->control_pressed) { break; }
445                                 
446                 
447                 
448                                 getOTUData(listFileName);
449
450                                 m->mothurRemove(distFileName);
451                                 m->mothurRemove(namesFileName);
452                                 m->mothurRemove(listFileName);
453                                 
454                                 if (m->control_pressed) { break; }
455                                 
456                                 initPyroCluster();
457
458                                 if (m->control_pressed) { break; }
459                                 
460             
461                                 for(int i=1;i<ncpus;i++){
462                                         MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
463                                         MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
464                                         MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
465                                         MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
466                                 }
467                                 
468                                 if (m->control_pressed) { break; }
469                                 
470                                 double maxDelta = 0;
471                                 int iter = 0;
472                                 
473                                 int numOTUsOnCPU = numOTUs / ncpus;
474                                 int numSeqsOnCPU = numSeqs / ncpus;
475                                 m->mothurOut("\nDenoising flowgrams...\n");
476                                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
477                                 
478                                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
479                                         
480                                         double cycClock = clock();
481                                         unsigned long long cycTime = time(NULL);
482                                         fill();
483                                         
484                                         if (m->control_pressed) { break; }
485
486                                         int total = singleTau.size();
487                                         for(int i=1;i<ncpus;i++){
488                                                 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
489                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
490                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
491                                                 
492                                                 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
493                                                 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
494                                                 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
495                                                 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
496                                                 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
497                                         }
498                                         
499                                         calcCentroidsDriver(0, numOTUsOnCPU);
500                                         
501                                         for(int i=1;i<ncpus;i++){
502                                                 int otuStart = i * numOTUs / ncpus;
503                                                 int otuStop = (i + 1) * numOTUs / ncpus;
504                                                 
505                                                 vector<int> tempCentroids(numOTUs, 0);
506                                                 vector<short> tempChange(numOTUs, 0);
507                                                 
508                                                 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
509                                                 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
510                                                 
511                                                 for(int j=otuStart;j<otuStop;j++){
512                                                         centroids[j] = tempCentroids[j];
513                                                         change[j] = tempChange[j];
514                                                 }
515                                         }
516                                                                         
517                                         maxDelta = getNewWeights(); if (m->control_pressed) { break; }
518                                         double nLL = getLikelihood(); if (m->control_pressed) { break; }
519                                         checkCentroids(); if (m->control_pressed) { break; }
520                                         
521                                         for(int i=1;i<ncpus;i++){
522                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
523                                                 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
524                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
525                                         }
526                                         
527                                         calcNewDistancesParent(0, numSeqsOnCPU);
528
529                                         total = singleTau.size();
530
531                                         for(int i=1;i<ncpus;i++){
532                                                 int childTotal;
533                                                 int seqStart = i * numSeqs / ncpus;
534                                                 int seqStop = (i + 1) * numSeqs / ncpus;
535                                                 
536                                                 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
537
538                                                 vector<int> childSeqIndex(childTotal, 0);
539                                                 vector<double> childSingleTau(childTotal, 0);
540                                                 vector<double> childDist(numSeqs * numOTUs, 0);
541                                                 vector<int> otuIndex(childTotal, 0);
542                                                 
543                                                 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
544                                                 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
545                                                 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
546                                                 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
547
548                                                 int oldTotal = total;
549                                                 total += childTotal;
550                                                 singleTau.resize(total, 0);
551                                                 seqIndex.resize(total, 0);
552                                                 seqNumber.resize(total, 0);
553                                                 
554                                                 int childIndex = 0;
555                                                 
556                                                 for(int j=oldTotal;j<total;j++){
557                                                         int otuI = otuIndex[childIndex];
558                                                         int seqI = childSeqIndex[childIndex];
559                                                         
560                                                         singleTau[j] = childSingleTau[childIndex];
561                                                         
562                                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
563                                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
564                                                         nSeqsPerOTU[otuI]++;
565                                                         childIndex++;
566                                                 }
567                                                 
568                                                 int index = seqStart * numOTUs;
569                                                 for(int j=seqStart;j<seqStop;j++){
570                                                         for(int k=0;k<numOTUs;k++){
571                                                                 dist[index] = childDist[index];
572                                                                 index++;
573                                                         }
574                                                 }                                       
575                                         }
576                                         
577                                         iter++;
578                                         
579                                         m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');                  
580
581                                         if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
582                                                 int live = 1;
583                                                 for(int i=1;i<ncpus;i++){
584                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
585                                                 }
586                                         }
587                                         else{
588                                                 int live = 0;
589                                                 for(int i=1;i<ncpus;i++){
590                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
591                                                 }
592                                         }
593                                         
594                                 }       
595                                 
596                                 if (m->control_pressed) { break; }
597                                 
598                                 m->mothurOut("\nFinalizing...\n");
599                                 fill();
600                                 
601                                 if (m->control_pressed) { break; }
602                                 
603                                 setOTUs();
604                                 
605                                 vector<int> otuCounts(numOTUs, 0);
606                                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
607                                 calcCentroidsDriver(0, numOTUs);
608                                 
609                                 if (m->control_pressed) { break; }
610                                 
611                                 writeQualities(otuCounts);      if (m->control_pressed) { break; }
612                                 writeSequences(otuCounts);      if (m->control_pressed) { break; }
613                                 writeNames(otuCounts);          if (m->control_pressed) { break; }
614                                 writeClusters(otuCounts);       if (m->control_pressed) { break; }
615                                 writeGroups();                          if (m->control_pressed) { break; }
616                                 
617                                                                  
618                                 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');                 
619                         }
620                 }
621                 else{
622                         int abort = 1;
623
624                         MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
625                         if(abort){      return 0;       }
626
627                         int numFiles;
628                         MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
629
630                         for(int i=0;i<numFiles;i++){
631                                 
632                                 if (m->control_pressed) { break; }
633                                 
634                                 //Now into the pyrodist part
635                                 bool live = 1;
636
637                                 char fileName[1024];
638                                 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
639                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
640                                 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
641                                 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
642                                 
643                                 flowDataIntI.resize(numSeqs * numFlowCells);
644                                 flowDataPrI.resize(numSeqs * numFlowCells);
645                                 mapUniqueToSeq.resize(numSeqs);
646                                 mapSeqToUnique.resize(numSeqs);
647                                 lengths.resize(numSeqs);
648                                 jointLookUp.resize(NUMBINS * NUMBINS);
649
650                                 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
651                                 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
652                                 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
653                                 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
654                                 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
655                                 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
656                                 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
657
658                                 flowFileName = string(fileName);
659                                 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
660                                 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
661                                 
662                                 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
663                                 
664                                 if (m->control_pressed) { break; }
665                                 
666                                 int done = 1;
667                                 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
668
669                                 //Now into the pyronoise part
670                                 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
671                                 
672                                 singleLookUp.resize(HOMOPS * NUMBINS);
673                                 uniqueFlowgrams.resize(numUniques * numFlowCells);
674                                 weight.resize(numOTUs);
675                                 centroids.resize(numOTUs);
676                                 change.resize(numOTUs);
677                                 dist.assign(numOTUs * numSeqs, 0);
678                                 nSeqsPerOTU.resize(numOTUs);
679                                 cumNumSeqs.resize(numOTUs);
680
681                                 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
682                                 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
683                                 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
684                                 
685                                 int startOTU = pid * numOTUs / ncpus;
686                                 int endOTU = (pid + 1) * numOTUs / ncpus;
687
688                                 int startSeq = pid * numSeqs / ncpus;
689                                 int endSeq = (pid + 1) * numSeqs /ncpus;
690                                 
691                                 int total;
692
693                                 while(live){
694                                         
695                                         if (m->control_pressed) { break; }
696                                         
697                                         MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
698                                         singleTau.assign(total, 0.0000);
699                                         seqNumber.assign(total, 0);
700                                         seqIndex.assign(total, 0);
701
702                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
703                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
704                                         MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
705                                         MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
706                                         MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
707                                         MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
708                                         MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
709
710                                         calcCentroidsDriver(startOTU, endOTU);
711
712                                         MPI_Send(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
713                                         MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
714
715                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
716                                         MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
717                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
718
719                                         vector<int> otuIndex(total, 0);
720                                         calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
721                                         total = otuIndex.size();
722                                         
723                                         MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
724                                         MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
725                                         MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
726                                         MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
727                                         MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
728                                 
729                                         MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
730                                 }
731                         }
732                 }
733                 
734                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
735                 
736                 MPI_Barrier(MPI_COMM_WORLD);
737
738                 
739                 if(compositeFASTAFileName != ""){
740                         outputNames.push_back(compositeFASTAFileName);
741                         outputNames.push_back(compositeNamesFileName);
742                 }
743
744                 m->mothurOutEndLine();
745                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
746                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
747                 m->mothurOutEndLine();
748                 
749                 return 0;
750
751         }
752         catch(exception& e) {
753                 m->errorOut(e, "ShhherCommand", "execute");
754                 exit(1);
755         }
756 }
757 /**************************************************************************************************/
758 string ShhherCommand::createNamesFile(){
759         try{
760                 
761                 vector<string> duplicateNames(numUniques, "");
762                 for(int i=0;i<numSeqs;i++){
763                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
764                 }
765                 
766                 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
767                 
768                 ofstream nameFile;
769                 m->openOutputFile(nameFileName, nameFile);
770                 
771                 for(int i=0;i<numUniques;i++){
772                         
773                         if (m->control_pressed) { break; }
774                         
775             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
776                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
777                 }
778                 
779                 nameFile.close();
780                 return  nameFileName;
781         }
782         catch(exception& e) {
783                 m->errorOut(e, "ShhherCommand", "createNamesFile");
784                 exit(1);
785         }
786 }
787 /**************************************************************************************************/
788
789 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
790         try{            
791                 ostringstream outStream;
792                 outStream.setf(ios::fixed, ios::floatfield);
793                 outStream.setf(ios::dec, ios::basefield);
794                 outStream.setf(ios::showpoint);
795                 outStream.precision(6);
796                 
797                 int begTime = time(NULL);
798                 double begClock = clock();
799                 
800                 for(int i=startSeq;i<stopSeq;i++){
801                         
802                         if (m->control_pressed) { break; }
803                         
804                         for(int j=0;j<i;j++){
805                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
806                                 
807                                 if(flowDistance < 1e-6){
808                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
809                                 }
810                                 else if(flowDistance <= cutoff){
811                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
812                                 }
813                         }
814                         if(i % 100 == 0){
815                                 m->mothurOut(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
816                         }
817                 }
818                 
819                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
820                 if(pid != 0){   fDistFileName += ".temp." + toString(pid);      }
821                 
822                 if (m->control_pressed) { return fDistFileName; }
823                 
824                 m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
825
826                 ofstream distFile(fDistFileName.c_str());
827                 distFile << outStream.str();            
828                 distFile.close();
829                 
830                 return fDistFileName;
831         }
832         catch(exception& e) {
833                 m->errorOut(e, "ShhherCommand", "flowDistMPI");
834                 exit(1);
835         }
836 }
837 /**************************************************************************************************/
838  
839 void ShhherCommand::getOTUData(string listFileName){
840     try {
841         
842         ifstream listFile;
843         m->openInputFile(listFileName, listFile);
844         string label;
845         
846         listFile >> label >> numOTUs;
847         
848         otuData.assign(numSeqs, 0);
849         cumNumSeqs.assign(numOTUs, 0);
850         nSeqsPerOTU.assign(numOTUs, 0);
851         aaP.clear();aaP.resize(numOTUs);
852         
853         seqNumber.clear();
854         aaI.clear();
855         seqIndex.clear();
856         
857         string singleOTU = "";
858         
859         for(int i=0;i<numOTUs;i++){
860             
861             if (m->control_pressed) { break; }
862             
863             listFile >> singleOTU;
864             
865             istringstream otuString(singleOTU);
866             
867             while(otuString){
868                 
869                 string seqName = "";
870                 
871                 for(int j=0;j<singleOTU.length();j++){
872                     char letter = otuString.get();
873                     
874                     if(letter != ','){
875                         seqName += letter;
876                     }
877                     else{
878                         map<string,int>::iterator nmIt = nameMap.find(seqName);
879                         int index = nmIt->second;
880                         
881                         nameMap.erase(nmIt);
882                         
883                         otuData[index] = i;
884                         nSeqsPerOTU[i]++;
885                         aaP[i].push_back(index);
886                         seqName = "";
887                     }
888                 }
889                 
890                 map<string,int>::iterator nmIt = nameMap.find(seqName);
891                 
892                 int index = nmIt->second;
893                 nameMap.erase(nmIt);
894                 
895                 otuData[index] = i;
896                 nSeqsPerOTU[i]++;
897                 aaP[i].push_back(index);        
898                 
899                 otuString.get();
900             }
901             
902             sort(aaP[i].begin(), aaP[i].end());
903             for(int j=0;j<nSeqsPerOTU[i];j++){
904                 seqNumber.push_back(aaP[i][j]);
905             }
906             for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
907                 aaP[i].push_back(0);
908             }
909             
910             
911         }
912         
913         for(int i=1;i<numOTUs;i++){
914             cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
915         }
916         aaI = aaP;
917         seqIndex = seqNumber;
918         
919         listFile.close();       
920         
921     }
922     catch(exception& e) {
923         m->errorOut(e, "ShhherCommand", "getOTUData");
924         exit(1);        
925     }           
926 }
927
928 /**************************************************************************************************/
929
930 void ShhherCommand::initPyroCluster(){                          
931     try{
932         if (numOTUs < processors) { processors = 1; }
933         
934         dist.assign(numSeqs * numOTUs, 0);
935         change.assign(numOTUs, 1);
936         centroids.assign(numOTUs, -1);
937         weight.assign(numOTUs, 0);
938         singleTau.assign(numSeqs, 1.0);
939         
940         nSeqsBreaks.assign(processors+1, 0);
941         nOTUsBreaks.assign(processors+1, 0);
942         
943         nSeqsBreaks[0] = 0;
944         for(int i=0;i<processors;i++){
945             nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
946             nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
947         }
948         nSeqsBreaks[processors] = numSeqs;
949         nOTUsBreaks[processors] = numOTUs;
950     }
951     catch(exception& e) {
952         m->errorOut(e, "ShhherCommand", "initPyroCluster");
953         exit(1);        
954     }
955 }
956
957 /**************************************************************************************************/
958
959 void ShhherCommand::fill(){
960     try {
961         int index = 0;
962         for(int i=0;i<numOTUs;i++){
963             
964             if (m->control_pressed) { break; }
965             
966             cumNumSeqs[i] = index;
967             for(int j=0;j<nSeqsPerOTU[i];j++){
968                 seqNumber[index] = aaP[i][j];
969                 seqIndex[index] = aaI[i][j];
970                 
971                 index++;
972             }
973         }
974     }
975     catch(exception& e) {
976         m->errorOut(e, "ShhherCommand", "fill");
977         exit(1);        
978     }           
979 }
980
981 /**************************************************************************************************/
982  
983 void ShhherCommand::getFlowData(){
984     try{
985         ifstream flowFile;
986         m->openInputFile(flowFileName, flowFile);
987         
988         string seqName;
989         seqNameVector.clear();
990         lengths.clear();
991         flowDataIntI.clear();
992         nameMap.clear();
993         
994         
995         int currentNumFlowCells;
996         
997         float intensity;
998         
999         flowFile >> numFlowCells;
1000         int index = 0;//pcluster
1001         while(!flowFile.eof()){
1002             
1003             if (m->control_pressed) { break; }
1004             
1005             flowFile >> seqName >> currentNumFlowCells;
1006             lengths.push_back(currentNumFlowCells);
1007             
1008             seqNameVector.push_back(seqName);
1009             nameMap[seqName] = index++;//pcluster
1010             
1011             for(int i=0;i<numFlowCells;i++){
1012                 flowFile >> intensity;
1013                 if(intensity > 9.99)    {       intensity = 9.99;       }
1014                 int intI = int(100 * intensity + 0.0001);
1015                 flowDataIntI.push_back(intI);
1016             }
1017             m->gobble(flowFile);
1018         }
1019         flowFile.close();
1020         
1021         numSeqs = seqNameVector.size();         
1022         
1023         for(int i=0;i<numSeqs;i++){
1024             
1025             if (m->control_pressed) { break; }
1026             
1027             int iNumFlowCells = i * numFlowCells;
1028             for(int j=lengths[i];j<numFlowCells;j++){
1029                 flowDataIntI[iNumFlowCells + j] = 0;
1030             }
1031         }
1032         
1033     }
1034     catch(exception& e) {
1035         m->errorOut(e, "ShhherCommand", "getFlowData");
1036         exit(1);
1037     }
1038 }
1039 /**************************************************************************************************/
1040 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1041         
1042         try{
1043                 vector<double> newTau(numOTUs,0);
1044                 vector<double> norms(numSeqs, 0);
1045                 otuIndex.clear();
1046                 seqIndex.clear();
1047                 singleTau.clear();
1048                 
1049                 for(int i=startSeq;i<stopSeq;i++){
1050                         
1051                         if (m->control_pressed) { break; }
1052                         
1053                         double offset = 1e8;
1054                         int indexOffset = i * numOTUs;
1055                         
1056                         for(int j=0;j<numOTUs;j++){
1057                                 
1058                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1059                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1060                                 }
1061                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1062                                         offset = dist[indexOffset + j];
1063                                 }
1064                         }
1065                         
1066                         for(int j=0;j<numOTUs;j++){
1067                                 if(weight[j] > MIN_WEIGHT){
1068                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1069                                         norms[i] += newTau[j];
1070                                 }
1071                                 else{
1072                                         newTau[j] = 0.0;
1073                                 }
1074                         }
1075                         
1076                         for(int j=0;j<numOTUs;j++){
1077                 
1078                                 newTau[j] /= norms[i];
1079                                 
1080                                 if(newTau[j] > MIN_TAU){
1081                                         otuIndex.push_back(j);
1082                                         seqIndex.push_back(i);
1083                                         singleTau.push_back(newTau[j]);
1084                                 }
1085                         }
1086                         
1087                 }
1088         }
1089         catch(exception& e) {
1090                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1091                 exit(1);        
1092         }               
1093 }
1094
1095 /**************************************************************************************************/
1096
1097 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1098     
1099     try{
1100         
1101         int total = 0;
1102         vector<double> newTau(numOTUs,0);
1103         vector<double> norms(numSeqs, 0);
1104         nSeqsPerOTU.assign(numOTUs, 0);
1105         
1106         for(int i=startSeq;i<stopSeq;i++){
1107             
1108             if (m->control_pressed) { break; }
1109             
1110             int indexOffset = i * numOTUs;
1111             
1112             double offset = 1e8;
1113             
1114             for(int j=0;j<numOTUs;j++){
1115                 
1116                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1117                     dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1118                 }
1119                 
1120                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1121                     offset = dist[indexOffset + j];
1122                 }
1123             }
1124             
1125             for(int j=0;j<numOTUs;j++){
1126                 if(weight[j] > MIN_WEIGHT){
1127                     newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1128                     norms[i] += newTau[j];
1129                 }
1130                 else{
1131                     newTau[j] = 0.0;
1132                 }
1133             }
1134             
1135             for(int j=0;j<numOTUs;j++){
1136                 newTau[j] /= norms[i];
1137             }
1138             
1139             for(int j=0;j<numOTUs;j++){
1140                 if(newTau[j] > MIN_TAU){
1141                     
1142                     int oldTotal = total;
1143                     
1144                     total++;
1145                     
1146                     singleTau.resize(total, 0);
1147                     seqNumber.resize(total, 0);
1148                     seqIndex.resize(total, 0);
1149                     
1150                     singleTau[oldTotal] = newTau[j];
1151                     
1152                     aaP[j][nSeqsPerOTU[j]] = oldTotal;
1153                     aaI[j][nSeqsPerOTU[j]] = i;
1154                     nSeqsPerOTU[j]++;
1155                 }
1156             }
1157             
1158         }
1159         
1160     }
1161     catch(exception& e) {
1162         m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1163         exit(1);        
1164     }           
1165 }
1166
1167 /**************************************************************************************************/
1168
1169 void ShhherCommand::setOTUs(){
1170     
1171     try {
1172         vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1173         
1174         for(int i=0;i<numOTUs;i++){
1175             
1176             if (m->control_pressed) { break; }
1177             
1178             for(int j=0;j<nSeqsPerOTU[i];j++){
1179                 int index = cumNumSeqs[i] + j;
1180                 double tauValue = singleTau[seqNumber[index]];
1181                 int sIndex = seqIndex[index];
1182                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
1183             }
1184         }
1185         
1186         for(int i=0;i<numSeqs;i++){
1187             double maxTau = -1.0000;
1188             int maxOTU = -1;
1189             for(int j=0;j<numOTUs;j++){
1190                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1191                     maxTau = bigTauMatrix[i * numOTUs + j];
1192                     maxOTU = j;
1193                 }
1194             }
1195             
1196             otuData[i] = maxOTU;
1197         }
1198         
1199         nSeqsPerOTU.assign(numOTUs, 0);         
1200         
1201         for(int i=0;i<numSeqs;i++){
1202             int index = otuData[i];
1203             
1204             singleTau[i] = 1.0000;
1205             dist[i] = 0.0000;
1206             
1207             aaP[index][nSeqsPerOTU[index]] = i;
1208             aaI[index][nSeqsPerOTU[index]] = i;
1209             
1210             nSeqsPerOTU[index]++;
1211         }
1212         fill(); 
1213     }
1214     catch(exception& e) {
1215         m->errorOut(e, "ShhherCommand", "setOTUs");
1216         exit(1);        
1217     }           
1218 }
1219
1220 /**************************************************************************************************/
1221  
1222 void ShhherCommand::getUniques(){
1223     try{
1224         
1225         
1226         numUniques = 0;
1227         uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
1228         uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
1229         uniqueLengths.assign(numSeqs, 0);
1230         mapSeqToUnique.assign(numSeqs, -1);
1231         mapUniqueToSeq.assign(numSeqs, -1);
1232         
1233         vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
1234         
1235         for(int i=0;i<numSeqs;i++){
1236             
1237             if (m->control_pressed) { break; }
1238             
1239             int index = 0;
1240             
1241             vector<short> current(numFlowCells);
1242             for(int j=0;j<numFlowCells;j++){
1243                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
1244             }
1245             
1246             for(int j=0;j<numUniques;j++){
1247                 int offset = j * numFlowCells;
1248                 bool toEnd = 1;
1249                 
1250                 int shorterLength;
1251                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
1252                 else                                                            {       shorterLength = uniqueLengths[j];       }
1253                 
1254                 for(int k=0;k<shorterLength;k++){
1255                     if(current[k] != uniqueFlowgrams[offset + k]){
1256                         toEnd = 0;
1257                         break;
1258                     }
1259                 }
1260                 
1261                 if(toEnd){
1262                     mapSeqToUnique[i] = j;
1263                     uniqueCount[j]++;
1264                     index = j;
1265                     if(lengths[i] > uniqueLengths[j])   {       uniqueLengths[j] = lengths[i];  }
1266                     break;
1267                 }
1268                 index++;
1269             }
1270             
1271             if(index == numUniques){
1272                 uniqueLengths[numUniques] = lengths[i];
1273                 uniqueCount[numUniques] = 1;
1274                 mapSeqToUnique[i] = numUniques;//anMap
1275                 mapUniqueToSeq[numUniques] = i;//anF
1276                 
1277                 for(int k=0;k<numFlowCells;k++){
1278                     uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
1279                     uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
1280                 }
1281                 
1282                 numUniques++;
1283             }
1284         }
1285         uniqueFlowDataIntI.resize(numFlowCells * numUniques);
1286         uniqueLengths.resize(numUniques);       
1287         
1288         flowDataPrI.resize(numSeqs * numFlowCells, 0);
1289         for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
1290     }
1291     catch(exception& e) {
1292         m->errorOut(e, "ShhherCommand", "getUniques");
1293         exit(1);
1294     }
1295 }
1296
1297 /**************************************************************************************************/
1298
1299 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
1300     try{
1301         int minLength = lengths[mapSeqToUnique[seqA]];
1302         if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
1303         
1304         int ANumFlowCells = seqA * numFlowCells;
1305         int BNumFlowCells = seqB * numFlowCells;
1306         
1307         float dist = 0;
1308         
1309         for(int i=0;i<minLength;i++){
1310             
1311             if (m->control_pressed) { break; }
1312             
1313             int flowAIntI = flowDataIntI[ANumFlowCells + i];
1314             float flowAPrI = flowDataPrI[ANumFlowCells + i];
1315             
1316             int flowBIntI = flowDataIntI[BNumFlowCells + i];
1317             float flowBPrI = flowDataPrI[BNumFlowCells + i];
1318             dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
1319         }
1320         
1321         dist /= (float) minLength;
1322         return dist;
1323     }
1324     catch(exception& e) {
1325         m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
1326         exit(1);
1327     }
1328 }
1329
1330 //**********************************************************************************************************************/
1331
1332 string ShhherCommand::cluster(string distFileName, string namesFileName){
1333     try {
1334         
1335         ReadMatrix* read = new ReadColumnMatrix(distFileName);  
1336         read->setCutoff(cutoff);
1337         
1338         NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1339         clusterNameMap->readMap();
1340         read->read(clusterNameMap);
1341         
1342         ListVector* list = read->getListVector();
1343         SparseMatrix* matrix = read->getMatrix();
1344         
1345         delete read; 
1346         delete clusterNameMap; 
1347         
1348         RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
1349         
1350         Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
1351         string tag = cluster->getTag();
1352         
1353         double clusterCutoff = cutoff;
1354         while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1355             
1356             if (m->control_pressed) { break; }
1357             
1358             cluster->update(clusterCutoff);
1359         }
1360         
1361         list->setLabel(toString(cutoff));
1362         
1363         string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
1364         ofstream listFile;
1365         m->openOutputFile(listFileName, listFile);
1366         list->print(listFile);
1367         listFile.close();
1368         
1369         delete matrix;  delete cluster; delete rabund; delete list;
1370         
1371         return listFileName;
1372     }
1373     catch(exception& e) {
1374         m->errorOut(e, "ShhherCommand", "cluster");
1375         exit(1);        
1376     }           
1377 }
1378
1379 /**************************************************************************************************/
1380
1381 void ShhherCommand::calcCentroidsDriver(int start, int finish){                          
1382     
1383     //this function gets the most likely homopolymer length at a flow position for a group of sequences
1384     //within an otu
1385     
1386     try{
1387         
1388         for(int i=start;i<finish;i++){
1389             
1390             if (m->control_pressed) { break; }
1391             
1392             double count = 0;
1393             int position = 0;
1394             int minFlowGram = 100000000;
1395             double minFlowValue = 1e8;
1396             change[i] = 0; //FALSE
1397             
1398             for(int j=0;j<nSeqsPerOTU[i];j++){
1399                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1400             }
1401             
1402             if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1403                 vector<double> adF(nSeqsPerOTU[i]);
1404                 vector<int> anL(nSeqsPerOTU[i]);
1405                 
1406                 for(int j=0;j<nSeqsPerOTU[i];j++){
1407                     int index = cumNumSeqs[i] + j;
1408                     int nI = seqIndex[index];
1409                     int nIU = mapSeqToUnique[nI];
1410                     
1411                     int k;
1412                     for(k=0;k<position;k++){
1413                         if(nIU == anL[k]){
1414                             break;
1415                         }
1416                     }
1417                     if(k == position){
1418                         anL[position] = nIU;
1419                         adF[position] = 0.0000;
1420                         position++;
1421                     }                                           
1422                 }
1423                 
1424                 for(int j=0;j<nSeqsPerOTU[i];j++){
1425                     int index = cumNumSeqs[i] + j;
1426                     int nI = seqIndex[index];
1427                     
1428                     double tauValue = singleTau[seqNumber[index]];
1429                     
1430                     for(int k=0;k<position;k++){
1431                         double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1432                         adF[k] += dist * tauValue;
1433                     }
1434                 }
1435                 
1436                 for(int j=0;j<position;j++){
1437                     if(adF[j] < minFlowValue){
1438                         minFlowGram = j;
1439                         minFlowValue = adF[j];
1440                     }
1441                 }
1442                 
1443                 if(centroids[i] != anL[minFlowGram]){
1444                     change[i] = 1;
1445                     centroids[i] = anL[minFlowGram];
1446                 }
1447             }
1448             else if(centroids[i] != -1){
1449                 change[i] = 1;
1450                 centroids[i] = -1;                      
1451             }
1452         }
1453     }
1454     catch(exception& e) {
1455         m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1456         exit(1);        
1457     }           
1458 }
1459
1460 /**************************************************************************************************/
1461
1462 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1463     try{
1464         
1465         int flowAValue = cent * numFlowCells;
1466         int flowBValue = flow * numFlowCells;
1467         
1468         double dist = 0;
1469         
1470         for(int i=0;i<length;i++){
1471             dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1472             flowAValue++;
1473             flowBValue++;
1474         }
1475         
1476         return dist / (double)length;
1477     }
1478     catch(exception& e) {
1479         m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1480         exit(1);        
1481     }           
1482 }
1483
1484 /**************************************************************************************************/
1485
1486 double ShhherCommand::getNewWeights(){
1487     try{
1488         
1489         double maxChange = 0;
1490         
1491         for(int i=0;i<numOTUs;i++){
1492             
1493             if (m->control_pressed) { break; }
1494             
1495             double difference = weight[i];
1496             weight[i] = 0;
1497             
1498             for(int j=0;j<nSeqsPerOTU[i];j++){
1499                 int index = cumNumSeqs[i] + j;
1500                 double tauValue = singleTau[seqNumber[index]];
1501                 weight[i] += tauValue;
1502             }
1503             
1504             difference = fabs(weight[i] - difference);
1505             if(difference > maxChange){ maxChange = difference; }
1506         }
1507         return maxChange;
1508     }
1509     catch(exception& e) {
1510         m->errorOut(e, "ShhherCommand", "getNewWeights");
1511         exit(1);        
1512     }           
1513 }
1514  
1515  /**************************************************************************************************/
1516  
1517 double ShhherCommand::getLikelihood(){
1518     
1519     try{
1520         
1521         vector<long double> P(numSeqs, 0);
1522         int effNumOTUs = 0;
1523         
1524         for(int i=0;i<numOTUs;i++){
1525             if(weight[i] > MIN_WEIGHT){
1526                 effNumOTUs++;
1527             }
1528         }
1529         
1530         string hold;
1531         for(int i=0;i<numOTUs;i++){
1532             
1533             if (m->control_pressed) { break; }
1534             
1535             for(int j=0;j<nSeqsPerOTU[i];j++){
1536                 int index = cumNumSeqs[i] + j;
1537                 int nI = seqIndex[index];
1538                 double singleDist = dist[seqNumber[index]];
1539                 
1540                 P[nI] += weight[i] * exp(-singleDist * sigma);
1541             }
1542         }
1543         double nLL = 0.00;
1544         for(int i=0;i<numSeqs;i++){
1545             if(P[i] == 0){      P[i] = DBL_EPSILON;     }
1546             
1547             nLL += -log(P[i]);
1548         }
1549         
1550         nLL = nLL -(double)numSeqs * log(sigma);
1551         
1552         return nLL; 
1553     }
1554     catch(exception& e) {
1555         m->errorOut(e, "ShhherCommand", "getNewWeights");
1556         exit(1);        
1557     }           
1558 }
1559
1560 /**************************************************************************************************/
1561
1562 void ShhherCommand::checkCentroids(){
1563     try{
1564         vector<int> unique(numOTUs, 1);
1565         
1566         for(int i=0;i<numOTUs;i++){
1567             if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1568                 unique[i] = -1;
1569             }
1570         }
1571         
1572         for(int i=0;i<numOTUs;i++){
1573             
1574             if (m->control_pressed) { break; }
1575             
1576             if(unique[i] == 1){
1577                 for(int j=i+1;j<numOTUs;j++){
1578                     if(unique[j] == 1){
1579                         
1580                         if(centroids[j] == centroids[i]){
1581                             unique[j] = 0;
1582                             centroids[j] = -1;
1583                             
1584                             weight[i] += weight[j];
1585                             weight[j] = 0.0;
1586                         }
1587                     }
1588                 }
1589             }
1590         }
1591     }
1592     catch(exception& e) {
1593         m->errorOut(e, "ShhherCommand", "checkCentroids");
1594         exit(1);        
1595     }           
1596 }
1597  /**************************************************************************************************/
1598
1599
1600  
1601 void ShhherCommand::writeQualities(vector<int> otuCounts){
1602     
1603     try {
1604         string thisOutputDir = outputDir;
1605         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1606         string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.qual";
1607         
1608         ofstream qualityFile;
1609         m->openOutputFile(qualityFileName, qualityFile);
1610         
1611         qualityFile.setf(ios::fixed, ios::floatfield);
1612         qualityFile.setf(ios::showpoint);
1613         qualityFile << setprecision(6);
1614         
1615         vector<vector<int> > qualities(numOTUs);
1616         vector<double> pr(HOMOPS, 0);
1617         
1618         
1619         for(int i=0;i<numOTUs;i++){
1620             
1621             if (m->control_pressed) { break; }
1622             
1623             int index = 0;
1624             int base = 0;
1625             
1626             if(nSeqsPerOTU[i] > 0){
1627                 qualities[i].assign(1024, -1);
1628                 
1629                 while(index < numFlowCells){
1630                     double maxPrValue = 1e8;
1631                     short maxPrIndex = -1;
1632                     double count = 0.0000;
1633                     
1634                     pr.assign(HOMOPS, 0);
1635                     
1636                     for(int j=0;j<nSeqsPerOTU[i];j++){
1637                         int lIndex = cumNumSeqs[i] + j;
1638                         double tauValue = singleTau[seqNumber[lIndex]];
1639                         int sequenceIndex = aaI[i][j];
1640                         short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1641                         
1642                         count += tauValue;
1643                         
1644                         for(int s=0;s<HOMOPS;s++){
1645                             pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1646                         }
1647                     }
1648                     
1649                     maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1650                     maxPrValue = pr[maxPrIndex];
1651                     
1652                     if(count > MIN_COUNT){
1653                         double U = 0.0000;
1654                         double norm = 0.0000;
1655                         
1656                         for(int s=0;s<HOMOPS;s++){
1657                             norm += exp(-(pr[s] - maxPrValue));
1658                         }
1659                         
1660                         for(int s=1;s<=maxPrIndex;s++){
1661                             int value = 0;
1662                             double temp = 0.0000;
1663                             
1664                             U += exp(-(pr[s-1]-maxPrValue))/norm;
1665                             
1666                             if(U>0.00){
1667                                 temp = log10(U);
1668                             }
1669                             else{
1670                                 temp = -10.1;
1671                             }
1672                             temp = floor(-10 * temp);
1673                             value = (int)floor(temp);
1674                             if(value > 100){    value = 100;    }
1675                             
1676                             qualities[i][base] = (int)value;
1677                             base++;
1678                         }
1679                     }
1680                     
1681                     index++;
1682                 }
1683             }
1684             
1685             
1686             if(otuCounts[i] > 0){
1687                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1688                 
1689                 int j=4;        //need to get past the first four bases
1690                 while(qualities[i][j] != -1){
1691                     qualityFile << qualities[i][j] << ' ';
1692                     j++;
1693                 }
1694                 qualityFile << endl;
1695             }
1696         }
1697         qualityFile.close();
1698         outputNames.push_back(qualityFileName);
1699         
1700     }
1701     catch(exception& e) {
1702         m->errorOut(e, "ShhherCommand", "writeQualities");
1703         exit(1);        
1704     }           
1705 }
1706
1707 /**************************************************************************************************/
1708
1709 void ShhherCommand::writeSequences(vector<int> otuCounts){
1710     try {
1711         string thisOutputDir = outputDir;
1712         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1713         string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.fasta";
1714         ofstream fastaFile;
1715         m->openOutputFile(fastaFileName, fastaFile);
1716         
1717         vector<string> names(numOTUs, "");
1718         
1719         for(int i=0;i<numOTUs;i++){
1720             
1721             if (m->control_pressed) { break; }
1722             
1723             int index = centroids[i];
1724             
1725             if(otuCounts[i] > 0){
1726                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1727                 
1728                 string newSeq = "";
1729                 
1730                 for(int j=0;j<numFlowCells;j++){
1731                     
1732                     char base = flowOrder[j % 4];
1733                     for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
1734                         newSeq += base;
1735                     }
1736                 }
1737                 
1738                 fastaFile << newSeq.substr(4) << endl;
1739             }
1740         }
1741         fastaFile.close();
1742         
1743         outputNames.push_back(fastaFileName);
1744         
1745         if(compositeFASTAFileName != ""){
1746             m->appendFiles(fastaFileName, compositeFASTAFileName);
1747         }
1748     }
1749     catch(exception& e) {
1750         m->errorOut(e, "ShhherCommand", "writeSequences");
1751         exit(1);        
1752     }           
1753 }
1754
1755 /**************************************************************************************************/
1756
1757 void ShhherCommand::writeNames(vector<int> otuCounts){
1758     try {
1759         string thisOutputDir = outputDir;
1760         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1761         string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.names";
1762         ofstream nameFile;
1763         m->openOutputFile(nameFileName, nameFile);
1764         
1765         for(int i=0;i<numOTUs;i++){
1766             
1767             if (m->control_pressed) { break; }
1768             
1769             if(otuCounts[i] > 0){
1770                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
1771                 
1772                 for(int j=1;j<nSeqsPerOTU[i];j++){
1773                     nameFile << ',' << seqNameVector[aaI[i][j]];
1774                 }
1775                 
1776                 nameFile << endl;
1777             }
1778         }
1779         nameFile.close();
1780         outputNames.push_back(nameFileName);
1781         
1782         
1783         if(compositeNamesFileName != ""){
1784             m->appendFiles(nameFileName, compositeNamesFileName);
1785         }               
1786     }
1787     catch(exception& e) {
1788         m->errorOut(e, "ShhherCommand", "writeNames");
1789         exit(1);        
1790     }           
1791 }
1792
1793 /**************************************************************************************************/
1794
1795 void ShhherCommand::writeGroups(){
1796     try {
1797         string thisOutputDir = outputDir;
1798         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1799         string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1800         string groupFileName = fileRoot + "shhh.groups";
1801         ofstream groupFile;
1802         m->openOutputFile(groupFileName, groupFile);
1803         
1804         for(int i=0;i<numSeqs;i++){
1805             if (m->control_pressed) { break; }
1806             groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
1807         }
1808         groupFile.close();
1809         outputNames.push_back(groupFileName);
1810         
1811     }
1812     catch(exception& e) {
1813         m->errorOut(e, "ShhherCommand", "writeGroups");
1814         exit(1);        
1815     }           
1816 }
1817
1818 /**************************************************************************************************/
1819
1820 void ShhherCommand::writeClusters(vector<int> otuCounts){
1821     try {
1822         string thisOutputDir = outputDir;
1823         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1824         string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.counts";
1825         ofstream otuCountsFile;
1826         m->openOutputFile(otuCountsFileName, otuCountsFile);
1827         
1828         string bases = flowOrder;
1829         
1830         for(int i=0;i<numOTUs;i++){
1831             
1832             if (m->control_pressed) {
1833                 break;
1834             }
1835             //output the translated version of the centroid sequence for the otu
1836             if(otuCounts[i] > 0){
1837                 int index = centroids[i];
1838                 
1839                 otuCountsFile << "ideal\t";
1840                 for(int j=8;j<numFlowCells;j++){
1841                     char base = bases[j % 4];
1842                     for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
1843                         otuCountsFile << base;
1844                     }
1845                 }
1846                 otuCountsFile << endl;
1847                 
1848                 for(int j=0;j<nSeqsPerOTU[i];j++){
1849                     int sequence = aaI[i][j];
1850                     otuCountsFile << seqNameVector[sequence] << '\t';
1851                     
1852                     string newSeq = "";
1853                     
1854                     for(int k=0;k<lengths[sequence];k++){
1855                         char base = bases[k % 4];
1856                         int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
1857                         
1858                         for(int s=0;s<freq;s++){
1859                             newSeq += base;
1860                             //otuCountsFile << base;
1861                         }
1862                     }
1863                     otuCountsFile << newSeq.substr(4) << endl;
1864                 }
1865                 otuCountsFile << endl;
1866             }
1867         }
1868         otuCountsFile.close();
1869         outputNames.push_back(otuCountsFileName);
1870         
1871     }
1872     catch(exception& e) {
1873         m->errorOut(e, "ShhherCommand", "writeClusters");
1874         exit(1);        
1875     }           
1876 }
1877
1878 #else
1879 //**********************************************************************************************************************
1880
1881 int ShhherCommand::execute(){
1882         try {
1883                 if (abort == true) { return 0; }
1884                 
1885                 getSingleLookUp();      if (m->control_pressed) { return 0; }
1886                 getJointLookUp();       if (m->control_pressed) { return 0; }
1887                 
1888         int numFiles = flowFileVector.size();
1889                 
1890         if (numFiles < processors) { processors = numFiles; }
1891         
1892 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1893         if (processors == 1) { driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size()); }
1894         else { createProcesses(flowFileVector); } //each processor processes one file
1895 #else
1896         driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size());
1897 #endif
1898         
1899                 if(compositeFASTAFileName != ""){
1900                         outputNames.push_back(compositeFASTAFileName);
1901                         outputNames.push_back(compositeNamesFileName);
1902                 }
1903
1904                 m->mothurOutEndLine();
1905                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
1906                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
1907                 m->mothurOutEndLine();
1908                 
1909                 return 0;
1910         }
1911         catch(exception& e) {
1912                 m->errorOut(e, "ShhherCommand", "execute");
1913                 exit(1);
1914         }
1915 }
1916 #endif
1917 /**************************************************************************************************/
1918
1919 int ShhherCommand::createProcesses(vector<string> filenames){
1920     try {
1921         vector<int> processIDS;
1922                 int process = 1;
1923                 int num = 0;
1924                 
1925                 //sanity check
1926                 if (filenames.size() < processors) { processors = filenames.size(); }
1927                 
1928                 //divide the groups between the processors
1929                 vector<linePair> lines;
1930                 int numFilesPerProcessor = filenames.size() / processors;
1931                 for (int i = 0; i < processors; i++) {
1932                         int startIndex =  i * numFilesPerProcessor;
1933                         int endIndex = (i+1) * numFilesPerProcessor;
1934                         if(i == (processors - 1)){      endIndex = filenames.size();    }
1935                         lines.push_back(linePair(startIndex, endIndex));
1936                 }
1937                 
1938         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)          
1939                 
1940                 //loop through and create all the processes you want
1941                 while (process != processors) {
1942                         int pid = fork();
1943                         
1944                         if (pid > 0) {
1945                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1946                                 process++;
1947                         }else if (pid == 0){
1948                                 num = driver(filenames, compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName  + toString(getpid()) + ".temp", lines[process].start, lines[process].end);
1949                                 exit(0);
1950                         }else { 
1951                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
1952                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1953                                 exit(0);
1954                         }
1955                 }
1956                 
1957                 //do my part
1958                 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[0].start, lines[0].end);
1959                 
1960                 //force parent to wait until all the processes are done
1961                 for (int i=0;i<processIDS.size();i++) { 
1962                         int temp = processIDS[i];
1963                         wait(&temp);
1964                 }
1965         
1966         #else
1967         
1968         //////////////////////////////////////////////////////////////////////////////////////////////////////
1969         
1970         /////////////////////// NOT WORKING, ACCESS VIOLATION ON READ OF FLOWGRAMS IN THREAD /////////////////
1971         
1972         //////////////////////////////////////////////////////////////////////////////////////////////////////
1973                 //Windows version shared memory, so be careful when passing variables through the shhhFlowsData struct. 
1974                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
1975                 //////////////////////////////////////////////////////////////////////////////////////////////////////
1976                 
1977                 vector<shhhFlowsData*> pDataArray; 
1978                 DWORD   dwThreadIdArray[processors-1];
1979                 HANDLE  hThreadArray[processors-1]; 
1980                 
1981                 //Create processor worker threads.
1982                 for( int i=0; i<processors-1; i++ ){
1983                         // Allocate memory for thread data.
1984                         string extension = "";
1985                         if (i != 0) { extension = toString(i) + ".temp"; }
1986                         
1987             shhhFlowsData* tempFlow = new shhhFlowsData(filenames, (compositeFASTAFileName + extension), (compositeNamesFileName + extension), outputDir, flowOrder, jointLookUp, singleLookUp, m, lines[i].start, lines[i].end, cutoff, sigma, minDelta, maxIters, i);
1988                         pDataArray.push_back(tempFlow);
1989                         processIDS.push_back(i);
1990             
1991                         hThreadArray[i] = CreateThread(NULL, 0, ShhhFlowsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
1992                 }
1993                 
1994                 //using the main process as a worker saves time and memory
1995                 //do my part
1996                 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[processors-1].start, lines[processors-1].end);
1997                 
1998                 //Wait until all threads have terminated.
1999                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
2000                 
2001                 //Close all thread handles and free memory allocations.
2002                 for(int i=0; i < pDataArray.size(); i++){
2003                         for(int j=0; j < pDataArray[i]->outputNames.size(); j++){ outputNames.push_back(pDataArray[i]->outputNames[j]); }
2004                         CloseHandle(hThreadArray[i]);
2005                         delete pDataArray[i];
2006                 }
2007                 
2008         #endif
2009         
2010         for (int i=0;i<processIDS.size();i++) { 
2011             if (compositeFASTAFileName != "") {
2012                 m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
2013                 m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
2014                 m->mothurRemove((compositeFASTAFileName + toString(processIDS[i]) + ".temp"));
2015                 m->mothurRemove((compositeNamesFileName + toString(processIDS[i]) + ".temp"));
2016             }
2017         }
2018         
2019         return 0;
2020         
2021     }
2022         catch(exception& e) {
2023                 m->errorOut(e, "ShhherCommand", "createProcesses");
2024                 exit(1);
2025         }
2026 }
2027 /**************************************************************************************************/
2028
2029 vector<string> ShhherCommand::parseFlowFiles(string filename){
2030     try {
2031         vector<string> files;
2032         int count = 0;
2033         
2034         ifstream in;
2035         m->openInputFile(filename, in);
2036         
2037         int thisNumFLows = 0;
2038         in >> thisNumFLows; m->gobble(in);
2039         
2040         while (!in.eof()) {
2041             if (m->control_pressed) { break; }
2042             
2043             ofstream out;
2044             string outputFileName = filename + toString(count) + ".temp";
2045             m->openOutputFile(outputFileName, out);
2046             out << thisNumFLows << endl;
2047             files.push_back(outputFileName);
2048             
2049             int numLinesWrote = 0;
2050             for (int i = 0; i < largeSize; i++) {
2051                 if (in.eof()) { break; }
2052                 string line = m->getline(in); m->gobble(in);
2053                 out << line << endl;
2054                 numLinesWrote++;
2055             }
2056             out.close();
2057             
2058             if (numLinesWrote == 0) {  m->mothurRemove(outputFileName); files.pop_back();  }
2059             count++;
2060         }
2061         in.close();
2062         
2063         if (m->control_pressed) { for (int i = 0; i < files.size(); i++) { m->mothurRemove(files[i]); }  files.clear(); }
2064         
2065         m->mothurOut("\nDivided " + filename + " into " + toString(files.size()) + " files.\n\n"); 
2066         
2067         return files;
2068     }
2069         catch(exception& e) {
2070                 m->errorOut(e, "ShhherCommand", "parseFlowFiles");
2071                 exit(1);
2072         }
2073 }
2074 /**************************************************************************************************/
2075
2076 int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName, int start, int end){
2077     try {
2078         
2079         for(int i=start;i<end;i++){
2080                         
2081                         if (m->control_pressed) { break; }
2082                         
2083             vector<string> theseFlowFileNames; theseFlowFileNames.push_back(filenames[i]);
2084             if (large) {  theseFlowFileNames = parseFlowFiles(filenames[i]);  }
2085             
2086             if (m->control_pressed) { break; }
2087             
2088             double begClock = clock();
2089             unsigned long long begTime;
2090             
2091             for (int g = 0; g < theseFlowFileNames.size(); g++) {
2092                 
2093                 string flowFileName = theseFlowFileNames[g];
2094                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
2095                 m->mothurOut("Reading flowgrams...\n");
2096                 
2097                 vector<string> seqNameVector;
2098                 vector<int> lengths;
2099                 vector<short> flowDataIntI;
2100                 vector<double> flowDataPrI;
2101                 map<string, int> nameMap;
2102                 vector<short> uniqueFlowgrams;
2103                 vector<int> uniqueCount;
2104                 vector<int> mapSeqToUnique;
2105                 vector<int> mapUniqueToSeq;
2106                 vector<int> uniqueLengths;
2107                 int numFlowCells;
2108                 
2109                 int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
2110                 
2111                 if (m->control_pressed) { break; }
2112                 
2113                 m->mothurOut("Identifying unique flowgrams...\n");
2114                 int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
2115                 
2116                 if (m->control_pressed) { break; }
2117                 
2118                 m->mothurOut("Calculating distances between flowgrams...\n");
2119                 string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
2120                 begTime = time(NULL);
2121                
2122                 
2123                 flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI); 
2124                 
2125                 m->mothurOutEndLine();
2126                 m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
2127                 
2128                 
2129                 string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
2130                 createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
2131                 
2132                 if (m->control_pressed) { break; }
2133                 
2134                 m->mothurOut("\nClustering flowgrams...\n");
2135                 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
2136                 cluster(listFileName, distFileName, namesFileName);
2137                 
2138                 if (m->control_pressed) { break; }
2139                 
2140                 vector<int> otuData;
2141                 vector<int> cumNumSeqs;
2142                 vector<int> nSeqsPerOTU;
2143                 vector<vector<int> > aaP;       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2144                 vector<vector<int> > aaI;       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2145                 vector<int> seqNumber;          //tMaster->anP:         the sequence id number sorted by OTU
2146                 vector<int> seqIndex;           //tMaster->anI;         the index that corresponds to seqNumber
2147                 
2148                 
2149                 int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
2150                 
2151                 if (m->control_pressed) { break; }
2152                 
2153                 m->mothurRemove(distFileName);
2154                 m->mothurRemove(namesFileName);
2155                 m->mothurRemove(listFileName);
2156                 
2157                 vector<double> dist;            //adDist - distance of sequences to centroids
2158                 vector<short> change;           //did the centroid sequence change? 0 = no; 1 = yes
2159                 vector<int> centroids;          //the representative flowgram for each cluster m
2160                 vector<double> weight;
2161                 vector<double> singleTau;       //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2162                 vector<int> nSeqsBreaks;
2163                 vector<int> nOTUsBreaks;
2164                 
2165                 dist.assign(numSeqs * numOTUs, 0);
2166                 change.assign(numOTUs, 1);
2167                 centroids.assign(numOTUs, -1);
2168                 weight.assign(numOTUs, 0);
2169                 singleTau.assign(numSeqs, 1.0);
2170                 
2171                 nSeqsBreaks.assign(2, 0);
2172                 nOTUsBreaks.assign(2, 0);
2173                 
2174                 nSeqsBreaks[0] = 0;
2175                 nSeqsBreaks[1] = numSeqs;
2176                 nOTUsBreaks[1] = numOTUs;
2177                 
2178                 if (m->control_pressed) { break; }
2179                 
2180                 double maxDelta = 0;
2181                 int iter = 0;
2182                 
2183                 begClock = clock();
2184                 begTime = time(NULL);
2185                 
2186                 m->mothurOut("\nDenoising flowgrams...\n");
2187                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
2188                 
2189                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
2190                     
2191                     if (m->control_pressed) { break; }
2192                     
2193                     double cycClock = clock();
2194                     unsigned long long cycTime = time(NULL);
2195                     fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2196                     
2197                     if (m->control_pressed) { break; }
2198                     
2199                     calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2200                     
2201                     if (m->control_pressed) { break; }
2202                     
2203                     maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);  
2204                     
2205                     if (m->control_pressed) { break; }
2206                     
2207                     double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight); 
2208                     
2209                     if (m->control_pressed) { break; }
2210                     
2211                     checkCentroids(numOTUs, centroids, weight);
2212                     
2213                     if (m->control_pressed) { break; }
2214                     
2215                     calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU,  dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
2216                     
2217                     if (m->control_pressed) { break; }
2218                     
2219                     iter++;
2220                     
2221                     m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
2222                     
2223                 }       
2224                 
2225                 if (m->control_pressed) { break; }
2226                 
2227                 m->mothurOut("\nFinalizing...\n");
2228                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2229                 
2230                 if (m->control_pressed) { break; }
2231                 
2232                 setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
2233                 
2234                 if (m->control_pressed) { break; }
2235                 
2236                 vector<int> otuCounts(numOTUs, 0);
2237                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
2238                 
2239                 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber); 
2240                 
2241                 if (m->control_pressed) { break; }
2242                 
2243                 if ((large) && (g == 0)) {  flowFileName = filenames[i]; theseFlowFileNames[0] = filenames[i]; }
2244                 string thisOutputDir = outputDir;
2245                 if (outputDir == "") {  thisOutputDir = m->hasPath(flowFileName);  }
2246                 string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.qual";
2247                 string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.fasta";
2248                 string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.names";
2249                 string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.counts";
2250                 string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
2251                 string groupFileName = fileRoot + "shhh.groups";
2252
2253                 
2254                 writeQualities(numOTUs, numFlowCells, qualityFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
2255                 writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, fastaFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
2256                 writeNames(thisCompositeNamesFileName, numOTUs, nameFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU);                              if (m->control_pressed) { break; }
2257                 writeClusters(otuCountsFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI);                  if (m->control_pressed) { break; }
2258                 writeGroups(groupFileName, fileRoot, numSeqs, seqNameVector);                                           if (m->control_pressed) { break; }
2259                 
2260                 if (large) {
2261                     if (g > 0) {
2262                         m->appendFiles(qualityFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.qual"));
2263                         m->mothurRemove(qualityFileName);
2264                         m->appendFiles(fastaFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.fasta"));
2265                         m->mothurRemove(fastaFileName);
2266                         m->appendFiles(nameFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.names"));
2267                         m->mothurRemove(nameFileName);
2268                         m->appendFiles(otuCountsFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.counts"));
2269                         m->mothurRemove(otuCountsFileName);
2270                         m->appendFiles(groupFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + "shhh.groups"));
2271                         m->mothurRemove(groupFileName);
2272                     }
2273                     m->mothurRemove(theseFlowFileNames[g]);
2274                 }
2275                         }
2276             
2277                         m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
2278                 }
2279                 
2280         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
2281         
2282         return 0;
2283         
2284     }catch(exception& e) {
2285             m->errorOut(e, "ShhherCommand", "driver");
2286             exit(1);
2287     }
2288 }
2289
2290 /**************************************************************************************************/
2291 int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVector, vector<int>& thisLengths, vector<short>& thisFlowDataIntI, map<string, int>& thisNameMap, int& numFlowCells){
2292         try{
2293        
2294                 ifstream flowFile;
2295        
2296                 m->openInputFile(filename, flowFile);
2297                 
2298                 string seqName;
2299                 int currentNumFlowCells;
2300                 float intensity;
2301         thisSeqNameVector.clear();
2302                 thisLengths.clear();
2303                 thisFlowDataIntI.clear();
2304                 thisNameMap.clear();
2305                 
2306                 flowFile >> numFlowCells;
2307                 int index = 0;//pcluster
2308                 while(!flowFile.eof()){
2309                         
2310                         if (m->control_pressed) { break; }
2311                         
2312                         flowFile >> seqName >> currentNumFlowCells;
2313                         thisLengths.push_back(currentNumFlowCells);
2314            
2315                         thisSeqNameVector.push_back(seqName);
2316                         thisNameMap[seqName] = index++;//pcluster
2317
2318                         for(int i=0;i<numFlowCells;i++){
2319                                 flowFile >> intensity;
2320                                 if(intensity > 9.99)    {       intensity = 9.99;       }
2321                                 int intI = int(100 * intensity + 0.0001);
2322                                 thisFlowDataIntI.push_back(intI);
2323                         }
2324                         m->gobble(flowFile);
2325                 }
2326                 flowFile.close();
2327                 
2328                 int numSeqs = thisSeqNameVector.size();         
2329                 
2330                 for(int i=0;i<numSeqs;i++){
2331                         
2332                         if (m->control_pressed) { break; }
2333                         
2334                         int iNumFlowCells = i * numFlowCells;
2335                         for(int j=thisLengths[i];j<numFlowCells;j++){
2336                                 thisFlowDataIntI[iNumFlowCells + j] = 0;
2337                         }
2338                 }
2339         
2340         return numSeqs;
2341                 
2342         }
2343         catch(exception& e) {
2344                 m->errorOut(e, "ShhherCommand", "getFlowData");
2345                 exit(1);
2346         }
2347 }
2348 /**************************************************************************************************/
2349
2350 int ShhherCommand::flowDistParentFork(int numFlowCells, string distFileName, int stopSeq, vector<int>& mapUniqueToSeq, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2351         try{            
2352         
2353                 ostringstream outStream;
2354                 outStream.setf(ios::fixed, ios::floatfield);
2355                 outStream.setf(ios::dec, ios::basefield);
2356                 outStream.setf(ios::showpoint);
2357                 outStream.precision(6);
2358                 
2359                 int begTime = time(NULL);
2360                 double begClock = clock();
2361         
2362                 for(int i=0;i<stopSeq;i++){
2363                         
2364                         if (m->control_pressed) { break; }
2365                         
2366                         for(int j=0;j<i;j++){
2367                                 float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2368                 
2369                                 if(flowDistance < 1e-6){
2370                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
2371                                 }
2372                                 else if(flowDistance <= cutoff){
2373                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
2374                                 }
2375                         }
2376                         if(i % 100 == 0){
2377                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
2378                                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2379                                 m->mothurOutEndLine();
2380                         }
2381                 }
2382                 
2383                 ofstream distFile(distFileName.c_str());
2384                 distFile << outStream.str();            
2385                 distFile.close();
2386                 
2387                 if (m->control_pressed) {}
2388                 else {
2389                         m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
2390                         m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2391                         m->mothurOutEndLine();
2392                 }
2393         
2394         return 0;
2395         }
2396         catch(exception& e) {
2397                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
2398                 exit(1);
2399         }
2400 }
2401 /**************************************************************************************************/
2402
2403 float ShhherCommand::calcPairwiseDist(int numFlowCells, int seqA, int seqB, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2404         try{
2405                 int minLength = lengths[mapSeqToUnique[seqA]];
2406                 if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
2407                 
2408                 int ANumFlowCells = seqA * numFlowCells;
2409                 int BNumFlowCells = seqB * numFlowCells;
2410                 
2411                 float dist = 0;
2412                 
2413                 for(int i=0;i<minLength;i++){
2414                         
2415                         if (m->control_pressed) { break; }
2416                         
2417                         int flowAIntI = flowDataIntI[ANumFlowCells + i];
2418                         float flowAPrI = flowDataPrI[ANumFlowCells + i];
2419                         
2420                         int flowBIntI = flowDataIntI[BNumFlowCells + i];
2421                         float flowBPrI = flowDataPrI[BNumFlowCells + i];
2422                         dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
2423                 }
2424                 
2425                 dist /= (float) minLength;
2426                 return dist;
2427         }
2428         catch(exception& e) {
2429                 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
2430                 exit(1);
2431         }
2432 }
2433
2434 /**************************************************************************************************/
2435
2436 int ShhherCommand::getUniques(int numSeqs, int numFlowCells, vector<short>& uniqueFlowgrams, vector<int>& uniqueCount, vector<int>& uniqueLengths, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2437         try{
2438                 int numUniques = 0;
2439                 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
2440                 uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
2441                 uniqueLengths.assign(numSeqs, 0);
2442                 mapSeqToUnique.assign(numSeqs, -1);
2443                 mapUniqueToSeq.assign(numSeqs, -1);
2444                 
2445                 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
2446                 
2447                 for(int i=0;i<numSeqs;i++){
2448                         
2449                         if (m->control_pressed) { break; }
2450                         
2451                         int index = 0;
2452                         
2453                         vector<short> current(numFlowCells);
2454                         for(int j=0;j<numFlowCells;j++){
2455                                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
2456                         }
2457             
2458                         for(int j=0;j<numUniques;j++){
2459                                 int offset = j * numFlowCells;
2460                                 bool toEnd = 1;
2461                                 
2462                                 int shorterLength;
2463                                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
2464                                 else                                                            {       shorterLength = uniqueLengths[j];       }
2465                 
2466                                 for(int k=0;k<shorterLength;k++){
2467                                         if(current[k] != uniqueFlowgrams[offset + k]){
2468                                                 toEnd = 0;
2469                                                 break;
2470                                         }
2471                                 }
2472                                 
2473                                 if(toEnd){
2474                                         mapSeqToUnique[i] = j;
2475                                         uniqueCount[j]++;
2476                                         index = j;
2477                                         if(lengths[i] > uniqueLengths[j])       {       uniqueLengths[j] = lengths[i];  }
2478                                         break;
2479                                 }
2480                                 index++;
2481                         }
2482                         
2483                         if(index == numUniques){
2484                                 uniqueLengths[numUniques] = lengths[i];
2485                                 uniqueCount[numUniques] = 1;
2486                                 mapSeqToUnique[i] = numUniques;//anMap
2487                                 mapUniqueToSeq[numUniques] = i;//anF
2488                                 
2489                                 for(int k=0;k<numFlowCells;k++){
2490                                         uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
2491                                         uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
2492                                 }
2493                                 
2494                                 numUniques++;
2495                         }
2496                 }
2497                 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
2498                 uniqueLengths.resize(numUniques);       
2499                 
2500                 flowDataPrI.resize(numSeqs * numFlowCells, 0);
2501                 for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
2502         
2503         return numUniques;
2504         }
2505         catch(exception& e) {
2506                 m->errorOut(e, "ShhherCommand", "getUniques");
2507                 exit(1);
2508         }
2509 }
2510 /**************************************************************************************************/
2511 int ShhherCommand::createNamesFile(int numSeqs, int numUniques, string filename, vector<string>& seqNameVector, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq){
2512         try{
2513                 
2514                 vector<string> duplicateNames(numUniques, "");
2515                 for(int i=0;i<numSeqs;i++){
2516                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
2517                 }
2518                 
2519                 ofstream nameFile;
2520                 m->openOutputFile(filename, nameFile);
2521                 
2522                 for(int i=0;i<numUniques;i++){
2523                         
2524                         if (m->control_pressed) { break; }
2525                         
2526             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2527                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2528                 }
2529                 
2530                 nameFile.close();
2531         
2532                 return 0;
2533         }
2534         catch(exception& e) {
2535                 m->errorOut(e, "ShhherCommand", "createNamesFile");
2536                 exit(1);
2537         }
2538 }
2539 //**********************************************************************************************************************
2540
2541 int ShhherCommand::cluster(string filename, string distFileName, string namesFileName){
2542         try {
2543                 
2544                 ReadMatrix* read = new ReadColumnMatrix(distFileName);  
2545                 read->setCutoff(cutoff);
2546                 
2547                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
2548                 clusterNameMap->readMap();
2549                 read->read(clusterNameMap);
2550         
2551                 ListVector* list = read->getListVector();
2552                 SparseMatrix* matrix = read->getMatrix();
2553                 
2554                 delete read; 
2555                 delete clusterNameMap; 
2556         
2557                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
2558                 
2559                 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
2560                 string tag = cluster->getTag();
2561                 
2562                 double clusterCutoff = cutoff;
2563                 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
2564                         
2565                         if (m->control_pressed) { break; }
2566                         
2567                         cluster->update(clusterCutoff);
2568                 }
2569                 
2570                 list->setLabel(toString(cutoff));
2571                 
2572                 ofstream listFile;
2573                 m->openOutputFile(filename, listFile);
2574                 list->print(listFile);
2575                 listFile.close();
2576                 
2577                 delete matrix;  delete cluster; delete rabund; delete list;
2578         
2579                 return 0;
2580         }
2581         catch(exception& e) {
2582                 m->errorOut(e, "ShhherCommand", "cluster");
2583                 exit(1);        
2584         }               
2585 }
2586 /**************************************************************************************************/
2587
2588 int ShhherCommand::getOTUData(int numSeqs, string fileName,  vector<int>& otuData,
2589                                vector<int>& cumNumSeqs,
2590                                vector<int>& nSeqsPerOTU,
2591                                vector<vector<int> >& aaP,       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2592                                vector<vector<int> >& aaI,       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2593                                vector<int>& seqNumber,          //tMaster->anP:         the sequence id number sorted by OTU
2594                                vector<int>& seqIndex,
2595                                map<string, int>& nameMap){
2596         try {
2597         
2598                 ifstream listFile;
2599                 m->openInputFile(fileName, listFile);
2600                 string label;
2601         int numOTUs;
2602                 
2603                 listFile >> label >> numOTUs;
2604         
2605                 otuData.assign(numSeqs, 0);
2606                 cumNumSeqs.assign(numOTUs, 0);
2607                 nSeqsPerOTU.assign(numOTUs, 0);
2608                 aaP.clear();aaP.resize(numOTUs);
2609                 
2610                 seqNumber.clear();
2611                 aaI.clear();
2612                 seqIndex.clear();
2613                 
2614                 string singleOTU = "";
2615                 
2616                 for(int i=0;i<numOTUs;i++){
2617                         
2618                         if (m->control_pressed) { break; }
2619             
2620                         listFile >> singleOTU;
2621                         
2622                         istringstream otuString(singleOTU);
2623             
2624                         while(otuString){
2625                                 
2626                                 string seqName = "";
2627                                 
2628                                 for(int j=0;j<singleOTU.length();j++){
2629                                         char letter = otuString.get();
2630                                         
2631                                         if(letter != ','){
2632                                                 seqName += letter;
2633                                         }
2634                                         else{
2635                                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2636                                                 int index = nmIt->second;
2637                                                 
2638                                                 nameMap.erase(nmIt);
2639                                                 
2640                                                 otuData[index] = i;
2641                                                 nSeqsPerOTU[i]++;
2642                                                 aaP[i].push_back(index);
2643                                                 seqName = "";
2644                                         }
2645                                 }
2646                                 
2647                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2648                 
2649                                 int index = nmIt->second;
2650                                 nameMap.erase(nmIt);
2651                 
2652                                 otuData[index] = i;
2653                                 nSeqsPerOTU[i]++;
2654                                 aaP[i].push_back(index);        
2655                                 
2656                                 otuString.get();
2657                         }
2658                         
2659                         sort(aaP[i].begin(), aaP[i].end());
2660                         for(int j=0;j<nSeqsPerOTU[i];j++){
2661                                 seqNumber.push_back(aaP[i][j]);
2662                         }
2663                         for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
2664                                 aaP[i].push_back(0);
2665                         }
2666                         
2667                         
2668                 }
2669                 
2670                 for(int i=1;i<numOTUs;i++){
2671                         cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
2672                 }
2673                 aaI = aaP;
2674                 seqIndex = seqNumber;
2675                 
2676                 listFile.close();       
2677         
2678         return numOTUs;
2679                 
2680         }
2681         catch(exception& e) {
2682                 m->errorOut(e, "ShhherCommand", "getOTUData");
2683                 exit(1);        
2684         }               
2685 }
2686 /**************************************************************************************************/
2687
2688 int ShhherCommand::calcCentroidsDriver(int numOTUs, 
2689                                           vector<int>& cumNumSeqs,
2690                                           vector<int>& nSeqsPerOTU,
2691                                           vector<int>& seqIndex,
2692                                           vector<short>& change,                //did the centroid sequence change? 0 = no; 1 = yes
2693                                           vector<int>& centroids,               //the representative flowgram for each cluster m
2694                                           vector<double>& singleTau,    //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2695                                           vector<int>& mapSeqToUnique,
2696                                           vector<short>& uniqueFlowgrams,
2697                                           vector<short>& flowDataIntI,
2698                                           vector<int>& lengths,
2699                                           int numFlowCells,
2700                                           vector<int>& seqNumber){                          
2701         
2702         //this function gets the most likely homopolymer length at a flow position for a group of sequences
2703         //within an otu
2704         
2705         try{
2706                 
2707                 for(int i=0;i<numOTUs;i++){
2708                         
2709                         if (m->control_pressed) { break; }
2710                         
2711                         double count = 0;
2712                         int position = 0;
2713                         int minFlowGram = 100000000;
2714                         double minFlowValue = 1e8;
2715                         change[i] = 0; //FALSE
2716                         
2717                         for(int j=0;j<nSeqsPerOTU[i];j++){
2718                                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
2719                         }
2720             
2721                         if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
2722                                 vector<double> adF(nSeqsPerOTU[i]);
2723                                 vector<int> anL(nSeqsPerOTU[i]);
2724                                 
2725                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2726                                         int index = cumNumSeqs[i] + j;
2727                                         int nI = seqIndex[index];
2728                                         int nIU = mapSeqToUnique[nI];
2729                                         
2730                                         int k;
2731                                         for(k=0;k<position;k++){
2732                                                 if(nIU == anL[k]){
2733                                                         break;
2734                                                 }
2735                                         }
2736                                         if(k == position){
2737                                                 anL[position] = nIU;
2738                                                 adF[position] = 0.0000;
2739                                                 position++;
2740                                         }                                               
2741                                 }
2742                                 
2743                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2744                                         int index = cumNumSeqs[i] + j;
2745                                         int nI = seqIndex[index];
2746                                         
2747                                         double tauValue = singleTau[seqNumber[index]];
2748                                         
2749                                         for(int k=0;k<position;k++){
2750                                                 double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
2751                                                 adF[k] += dist * tauValue;
2752                                         }
2753                                 }
2754                                 
2755                                 for(int j=0;j<position;j++){
2756                                         if(adF[j] < minFlowValue){
2757                                                 minFlowGram = j;
2758                                                 minFlowValue = adF[j];
2759                                         }
2760                                 }
2761                                 
2762                                 if(centroids[i] != anL[minFlowGram]){
2763                                         change[i] = 1;
2764                                         centroids[i] = anL[minFlowGram];
2765                                 }
2766                         }
2767                         else if(centroids[i] != -1){
2768                                 change[i] = 1;
2769                                 centroids[i] = -1;                      
2770                         }
2771                 }
2772         
2773         return 0;
2774         }
2775         catch(exception& e) {
2776                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
2777                 exit(1);        
2778         }               
2779 }
2780 /**************************************************************************************************/
2781
2782 double ShhherCommand::getDistToCentroid(int cent, int flow, int length, vector<short>& uniqueFlowgrams,
2783                                         vector<short>& flowDataIntI, int numFlowCells){
2784         try{
2785                 
2786                 int flowAValue = cent * numFlowCells;
2787                 int flowBValue = flow * numFlowCells;
2788                 
2789                 double dist = 0;
2790         
2791                 for(int i=0;i<length;i++){
2792                         dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
2793                         flowAValue++;
2794                         flowBValue++;
2795                 }
2796                 
2797                 return dist / (double)length;
2798         }
2799         catch(exception& e) {
2800                 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
2801                 exit(1);        
2802         }               
2803 }
2804 /**************************************************************************************************/
2805
2806 double ShhherCommand::getNewWeights(int numOTUs, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<double>& singleTau, vector<int>& seqNumber, vector<double>& weight){
2807         try{
2808                 
2809                 double maxChange = 0;
2810                 
2811                 for(int i=0;i<numOTUs;i++){
2812                         
2813                         if (m->control_pressed) { break; }
2814                         
2815                         double difference = weight[i];
2816                         weight[i] = 0;
2817                         
2818                         for(int j=0;j<nSeqsPerOTU[i];j++){
2819                                 int index = cumNumSeqs[i] + j;
2820                                 double tauValue = singleTau[seqNumber[index]];
2821                                 weight[i] += tauValue;
2822                         }
2823                         
2824                         difference = fabs(weight[i] - difference);
2825                         if(difference > maxChange){     maxChange = difference; }
2826                 }
2827                 return maxChange;
2828         }
2829         catch(exception& e) {
2830                 m->errorOut(e, "ShhherCommand", "getNewWeights");
2831                 exit(1);        
2832         }               
2833 }
2834
2835 /**************************************************************************************************/
2836
2837 double ShhherCommand::getLikelihood(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<int>& seqNumber, vector<int>& cumNumSeqs, vector<int>& seqIndex, vector<double>& dist, vector<double>& weight){
2838         
2839         try{
2840                 
2841                 vector<long double> P(numSeqs, 0);
2842                 int effNumOTUs = 0;
2843                 
2844                 for(int i=0;i<numOTUs;i++){
2845                         if(weight[i] > MIN_WEIGHT){
2846                                 effNumOTUs++;
2847                         }
2848                 }
2849                 
2850                 string hold;
2851                 for(int i=0;i<numOTUs;i++){
2852                         
2853                         if (m->control_pressed) { break; }
2854                         
2855                         for(int j=0;j<nSeqsPerOTU[i];j++){
2856                                 int index = cumNumSeqs[i] + j;
2857                                 int nI = seqIndex[index];
2858                                 double singleDist = dist[seqNumber[index]];
2859                                 
2860                                 P[nI] += weight[i] * exp(-singleDist * sigma);
2861                         }
2862                 }
2863                 double nLL = 0.00;
2864                 for(int i=0;i<numSeqs;i++){
2865                         if(P[i] == 0){  P[i] = DBL_EPSILON;     }
2866             
2867                         nLL += -log(P[i]);
2868                 }
2869                 
2870                 nLL = nLL -(double)numSeqs * log(sigma);
2871         
2872                 return nLL; 
2873         }
2874         catch(exception& e) {
2875                 m->errorOut(e, "ShhherCommand", "getNewWeights");
2876                 exit(1);        
2877         }               
2878 }
2879
2880 /**************************************************************************************************/
2881
2882 int ShhherCommand::checkCentroids(int numOTUs, vector<int>& centroids, vector<double>& weight){
2883         try{
2884                 vector<int> unique(numOTUs, 1);
2885                 
2886                 for(int i=0;i<numOTUs;i++){
2887                         if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
2888                                 unique[i] = -1;
2889                         }
2890                 }
2891                 
2892                 for(int i=0;i<numOTUs;i++){
2893                         
2894                         if (m->control_pressed) { break; }
2895                         
2896                         if(unique[i] == 1){
2897                                 for(int j=i+1;j<numOTUs;j++){
2898                                         if(unique[j] == 1){
2899                                                 
2900                                                 if(centroids[j] == centroids[i]){
2901                                                         unique[j] = 0;
2902                                                         centroids[j] = -1;
2903                                                         
2904                                                         weight[i] += weight[j];
2905                                                         weight[j] = 0.0;
2906                                                 }
2907                                         }
2908                                 }
2909                         }
2910                 }
2911         
2912         return 0;
2913         }
2914         catch(exception& e) {
2915                 m->errorOut(e, "ShhherCommand", "checkCentroids");
2916                 exit(1);        
2917         }               
2918 }
2919 /**************************************************************************************************/
2920
2921 void ShhherCommand::calcNewDistances(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<double>& dist, 
2922                                      vector<double>& weight, vector<short>& change, vector<int>& centroids,
2923                                      vector<vector<int> >& aaP, vector<double>& singleTau, vector<vector<int> >& aaI,   
2924                                      vector<int>& seqNumber, vector<int>& seqIndex,
2925                                      vector<short>& uniqueFlowgrams,
2926                                      vector<short>& flowDataIntI, int numFlowCells, vector<int>& lengths){
2927         
2928         try{
2929                 
2930                 int total = 0;
2931                 vector<double> newTau(numOTUs,0);
2932                 vector<double> norms(numSeqs, 0);
2933                 nSeqsPerOTU.assign(numOTUs, 0);
2934         
2935                 for(int i=0;i<numSeqs;i++){
2936                         
2937                         if (m->control_pressed) { break; }
2938                         
2939                         int indexOffset = i * numOTUs;
2940             
2941                         double offset = 1e8;
2942                         
2943                         for(int j=0;j<numOTUs;j++){
2944                 
2945                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
2946                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
2947                                 }
2948                 
2949                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
2950                                         offset = dist[indexOffset + j];
2951                                 }
2952                         }
2953             
2954                         for(int j=0;j<numOTUs;j++){
2955                                 if(weight[j] > MIN_WEIGHT){
2956                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
2957                                         norms[i] += newTau[j];
2958                                 }
2959                                 else{
2960                                         newTau[j] = 0.0;
2961                                 }
2962                         }
2963             
2964                         for(int j=0;j<numOTUs;j++){
2965                                 newTau[j] /= norms[i];
2966                         }
2967             
2968                         for(int j=0;j<numOTUs;j++){
2969                                 if(newTau[j] > MIN_TAU){
2970                                         
2971                                         int oldTotal = total;
2972                                         
2973                                         total++;
2974                                         
2975                                         singleTau.resize(total, 0);
2976                                         seqNumber.resize(total, 0);
2977                                         seqIndex.resize(total, 0);
2978                                         
2979                                         singleTau[oldTotal] = newTau[j];
2980                                         
2981                                         aaP[j][nSeqsPerOTU[j]] = oldTotal;
2982                                         aaI[j][nSeqsPerOTU[j]] = i;
2983                                         nSeqsPerOTU[j]++;
2984                                 }
2985                         }
2986             
2987                 }
2988         
2989         }
2990         catch(exception& e) {
2991                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
2992                 exit(1);        
2993         }               
2994 }
2995 /**************************************************************************************************/
2996
2997 int ShhherCommand::fill(int numOTUs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
2998         try {
2999                 int index = 0;
3000                 for(int i=0;i<numOTUs;i++){
3001                         
3002                         if (m->control_pressed) { return 0; }
3003                         
3004                         cumNumSeqs[i] = index;
3005                         for(int j=0;j<nSeqsPerOTU[i];j++){
3006                                 seqNumber[index] = aaP[i][j];
3007                                 seqIndex[index] = aaI[i][j];
3008                                 
3009                                 index++;
3010                         }
3011                 }
3012         
3013         return 0; 
3014         }
3015         catch(exception& e) {
3016                 m->errorOut(e, "ShhherCommand", "fill");
3017                 exit(1);        
3018         }               
3019 }
3020 /**************************************************************************************************/
3021
3022 void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU,
3023                             vector<int>& otuData, vector<double>& singleTau, vector<double>& dist, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
3024         
3025         try {
3026                 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
3027                 
3028                 for(int i=0;i<numOTUs;i++){
3029                         
3030                         if (m->control_pressed) { break; }
3031                         
3032                         for(int j=0;j<nSeqsPerOTU[i];j++){
3033                                 int index = cumNumSeqs[i] + j;
3034                                 double tauValue = singleTau[seqNumber[index]];
3035                                 int sIndex = seqIndex[index];
3036                                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
3037                         }
3038                 }
3039                 
3040                 for(int i=0;i<numSeqs;i++){
3041                         double maxTau = -1.0000;
3042                         int maxOTU = -1;
3043                         for(int j=0;j<numOTUs;j++){
3044                                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
3045                                         maxTau = bigTauMatrix[i * numOTUs + j];
3046                                         maxOTU = j;
3047                                 }
3048                         }
3049                         
3050                         otuData[i] = maxOTU;
3051                 }
3052                 
3053                 nSeqsPerOTU.assign(numOTUs, 0);         
3054                 
3055                 for(int i=0;i<numSeqs;i++){
3056                         int index = otuData[i];
3057                         
3058                         singleTau[i] = 1.0000;
3059                         dist[i] = 0.0000;
3060                         
3061                         aaP[index][nSeqsPerOTU[index]] = i;
3062                         aaI[index][nSeqsPerOTU[index]] = i;
3063                         
3064                         nSeqsPerOTU[index]++;
3065                 }
3066         
3067                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);  
3068         }
3069         catch(exception& e) {
3070                 m->errorOut(e, "ShhherCommand", "setOTUs");
3071                 exit(1);        
3072         }               
3073 }
3074 /**************************************************************************************************/
3075
3076 void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string qualityFileName, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
3077                                    vector<double>& singleTau, vector<short>& flowDataIntI, vector<short>& uniqueFlowgrams, vector<int>& cumNumSeqs,
3078                                    vector<int>& mapUniqueToSeq, vector<string>& seqNameVector, vector<int>& centroids, vector<vector<int> >& aaI){
3079         
3080         try {
3081         
3082                 ofstream qualityFile;
3083                 m->openOutputFile(qualityFileName, qualityFile);
3084         
3085                 qualityFile.setf(ios::fixed, ios::floatfield);
3086                 qualityFile.setf(ios::showpoint);
3087                 qualityFile << setprecision(6);
3088                 
3089                 vector<vector<int> > qualities(numOTUs);
3090                 vector<double> pr(HOMOPS, 0);
3091                 
3092                 
3093                 for(int i=0;i<numOTUs;i++){
3094                         
3095                         if (m->control_pressed) { break; }
3096                         
3097                         int index = 0;
3098                         int base = 0;
3099                         
3100                         if(nSeqsPerOTU[i] > 0){
3101                                 qualities[i].assign(1024, -1);
3102                                 
3103                                 while(index < numFlowCells){
3104                                         double maxPrValue = 1e8;
3105                                         short maxPrIndex = -1;
3106                                         double count = 0.0000;
3107                                         
3108                                         pr.assign(HOMOPS, 0);
3109                                         
3110                                         for(int j=0;j<nSeqsPerOTU[i];j++){
3111                                                 int lIndex = cumNumSeqs[i] + j;
3112                                                 double tauValue = singleTau[seqNumber[lIndex]];
3113                                                 int sequenceIndex = aaI[i][j];
3114                                                 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
3115                                                 
3116                                                 count += tauValue;
3117                                                 
3118                                                 for(int s=0;s<HOMOPS;s++){
3119                                                         pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
3120                                                 }
3121                                         }
3122                                         
3123                                         maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
3124                                         maxPrValue = pr[maxPrIndex];
3125                                         
3126                                         if(count > MIN_COUNT){
3127                                                 double U = 0.0000;
3128                                                 double norm = 0.0000;
3129                                                 
3130                                                 for(int s=0;s<HOMOPS;s++){
3131                                                         norm += exp(-(pr[s] - maxPrValue));
3132                                                 }
3133                                                 
3134                                                 for(int s=1;s<=maxPrIndex;s++){
3135                                                         int value = 0;
3136                                                         double temp = 0.0000;
3137                                                         
3138                                                         U += exp(-(pr[s-1]-maxPrValue))/norm;
3139                                                         
3140                                                         if(U>0.00){
3141                                                                 temp = log10(U);
3142                                                         }
3143                                                         else{
3144                                                                 temp = -10.1;
3145                                                         }
3146                                                         temp = floor(-10 * temp);
3147                                                         value = (int)floor(temp);
3148                                                         if(value > 100){        value = 100;    }
3149                                                         
3150                                                         qualities[i][base] = (int)value;
3151                                                         base++;
3152                                                 }
3153                                         }
3154                                         
3155                                         index++;
3156                                 }
3157                         }
3158                         
3159                         
3160                         if(otuCounts[i] > 0){
3161                                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
3162                         
3163                                 int j=4;        //need to get past the first four bases
3164                                 while(qualities[i][j] != -1){
3165                     qualityFile << qualities[i][j] << ' ';
3166                     if (j > qualities[i].size()) { break; }
3167                     j++;
3168                                 }
3169                                 qualityFile << endl;
3170                         }
3171                 }
3172                 qualityFile.close();
3173                 outputNames.push_back(qualityFileName);
3174         
3175         }
3176         catch(exception& e) {
3177                 m->errorOut(e, "ShhherCommand", "writeQualities");
3178                 exit(1);        
3179         }               
3180 }
3181
3182 /**************************************************************************************************/
3183
3184 void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string fastaFileName, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
3185         try {
3186                 
3187                 ofstream fastaFile;
3188                 m->openOutputFile(fastaFileName, fastaFile);
3189                 
3190                 vector<string> names(numOTUs, "");
3191                 
3192                 for(int i=0;i<numOTUs;i++){
3193                         
3194                         if (m->control_pressed) { break; }
3195                         
3196                         int index = centroids[i];
3197                         
3198                         if(otuCounts[i] > 0){
3199                                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
3200                                 
3201                                 string newSeq = "";
3202                                 
3203                                 for(int j=0;j<numFlowCells;j++){
3204                                         
3205                                         char base = flowOrder[j % 4];
3206                                         for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
3207                                                 newSeq += base;
3208                                         }
3209                                 }
3210                                 
3211                                 if (newSeq.length() >= 4) {  fastaFile << newSeq.substr(4) << endl;  }
3212                 else {  fastaFile << "NNNN" << endl;  }
3213                         }
3214                 }
3215                 fastaFile.close();
3216         
3217                 outputNames.push_back(fastaFileName);
3218         
3219                 if(thisCompositeFASTAFileName != ""){
3220                         m->appendFiles(fastaFileName, thisCompositeFASTAFileName);
3221                 }
3222         }
3223         catch(exception& e) {
3224                 m->errorOut(e, "ShhherCommand", "writeSequences");
3225                 exit(1);        
3226         }               
3227 }
3228
3229 /**************************************************************************************************/
3230
3231 void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string nameFileName, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
3232         try {
3233                 
3234                 ofstream nameFile;
3235                 m->openOutputFile(nameFileName, nameFile);
3236                 
3237                 for(int i=0;i<numOTUs;i++){
3238                         
3239                         if (m->control_pressed) { break; }
3240                         
3241                         if(otuCounts[i] > 0){
3242                                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
3243                                 
3244                                 for(int j=1;j<nSeqsPerOTU[i];j++){
3245                                         nameFile << ',' << seqNameVector[aaI[i][j]];
3246                                 }
3247                                 
3248                                 nameFile << endl;
3249                         }
3250                 }
3251                 nameFile.close();
3252                 outputNames.push_back(nameFileName);
3253                 
3254                 
3255                 if(thisCompositeNamesFileName != ""){
3256                         m->appendFiles(nameFileName, thisCompositeNamesFileName);
3257                 }               
3258         }
3259         catch(exception& e) {
3260                 m->errorOut(e, "ShhherCommand", "writeNames");
3261                 exit(1);        
3262         }               
3263 }
3264
3265 /**************************************************************************************************/
3266
3267 void ShhherCommand::writeGroups(string groupFileName, string fileRoot, int numSeqs, vector<string>& seqNameVector){
3268         try {
3269         ofstream groupFile;
3270                 m->openOutputFile(groupFileName, groupFile);
3271                 
3272                 for(int i=0;i<numSeqs;i++){
3273                         if (m->control_pressed) { break; }
3274                         groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
3275                 }
3276                 groupFile.close();
3277                 outputNames.push_back(groupFileName);
3278         
3279         }
3280         catch(exception& e) {
3281                 m->errorOut(e, "ShhherCommand", "writeGroups");
3282                 exit(1);        
3283         }               
3284 }
3285
3286 /**************************************************************************************************/
3287
3288 void ShhherCommand::writeClusters(string otuCountsFileName, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
3289         try {
3290                 ofstream otuCountsFile;
3291                 m->openOutputFile(otuCountsFileName, otuCountsFile);
3292                 
3293                 string bases = flowOrder;
3294                 
3295                 for(int i=0;i<numOTUs;i++){
3296                         
3297                         if (m->control_pressed) {
3298                                 break;
3299                         }
3300                         //output the translated version of the centroid sequence for the otu
3301                         if(otuCounts[i] > 0){
3302                                 int index = centroids[i];
3303                                 
3304                                 otuCountsFile << "ideal\t";
3305                                 for(int j=8;j<numFlowCells;j++){
3306                                         char base = bases[j % 4];
3307                                         for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
3308                                                 otuCountsFile << base;
3309                                         }
3310                                 }
3311                                 otuCountsFile << endl;
3312                                 
3313                                 for(int j=0;j<nSeqsPerOTU[i];j++){
3314                                         int sequence = aaI[i][j];
3315                                         otuCountsFile << seqNameVector[sequence] << '\t';
3316                                         
3317                                         string newSeq = "";
3318                                         
3319                                         for(int k=0;k<lengths[sequence];k++){
3320                                                 char base = bases[k % 4];
3321                                                 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
3322                         
3323                                                 for(int s=0;s<freq;s++){
3324                                                         newSeq += base;
3325                                                         //otuCountsFile << base;
3326                                                 }
3327                                         }
3328                                         
3329                     if (newSeq.length() >= 4) {  otuCountsFile << newSeq.substr(4) << endl;  }
3330                     else {  otuCountsFile << "NNNN" << endl;  }
3331                                 }
3332                                 otuCountsFile << endl;
3333                         }
3334                 }
3335                 otuCountsFile.close();
3336                 outputNames.push_back(otuCountsFileName);
3337         
3338         }
3339         catch(exception& e) {
3340                 m->errorOut(e, "ShhherCommand", "writeClusters");
3341                 exit(1);        
3342         }               
3343 }
3344
3345 /**************************************************************************************************/
3346
3347 void ShhherCommand::getSingleLookUp(){
3348         try{
3349                 //      these are the -log probabilities that a signal corresponds to a particular homopolymer length
3350                 singleLookUp.assign(HOMOPS * NUMBINS, 0);
3351                 
3352                 int index = 0;
3353                 ifstream lookUpFile;
3354                 m->openInputFile(lookupFileName, lookUpFile);
3355                 
3356                 for(int i=0;i<HOMOPS;i++){
3357                         
3358                         if (m->control_pressed) { break; }
3359                         
3360                         float logFracFreq;
3361                         lookUpFile >> logFracFreq;
3362                         
3363                         for(int j=0;j<NUMBINS;j++)      {
3364                                 lookUpFile >> singleLookUp[index];
3365                                 index++;
3366                         }
3367                 }       
3368                 lookUpFile.close();
3369         }
3370         catch(exception& e) {
3371                 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
3372                 exit(1);
3373         }
3374 }
3375
3376 /**************************************************************************************************/
3377
3378 void ShhherCommand::getJointLookUp(){
3379         try{
3380                 
3381                 //      the most likely joint probability (-log) that two intenities have the same polymer length
3382                 jointLookUp.resize(NUMBINS * NUMBINS, 0);
3383                 
3384                 for(int i=0;i<NUMBINS;i++){
3385                         
3386                         if (m->control_pressed) { break; }
3387                         
3388                         for(int j=0;j<NUMBINS;j++){             
3389                                 
3390                                 double minSum = 100000000;
3391                                 
3392                                 for(int k=0;k<HOMOPS;k++){
3393                                         double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
3394                                         
3395                                         if(sum < minSum)        {       minSum = sum;           }
3396                                 }       
3397                                 jointLookUp[i * NUMBINS + j] = minSum;
3398                         }
3399                 }
3400         }
3401         catch(exception& e) {
3402                 m->errorOut(e, "ShhherCommand", "getJointLookUp");
3403                 exit(1);
3404         }
3405 }
3406
3407 /**************************************************************************************************/
3408
3409 double ShhherCommand::getProbIntensity(int intIntensity){                          
3410         try{
3411
3412                 double minNegLogProb = 100000000; 
3413
3414                 
3415                 for(int i=0;i<HOMOPS;i++){//loop signal strength
3416                         
3417                         if (m->control_pressed) { break; }
3418                         
3419                         float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
3420                         if(negLogProb < minNegLogProb)  {       minNegLogProb = negLogProb; }
3421                 }
3422                 
3423                 return minNegLogProb;
3424         }
3425         catch(exception& e) {
3426                 m->errorOut(e, "ShhherCommand", "getProbIntensity");
3427                 exit(1);
3428         }
3429 }
3430
3431
3432
3433