]> git.donarmstrong.com Git - mothur.git/blob - shhhercommand.cpp
Merge remote-tracking branch 'origin/master'
[mothur.git] / shhhercommand.cpp
1 /*
2  *  shhher.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/27/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "shhhercommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> ShhherCommand::setParameters(){  
14         try {
15                 CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pflow);
16                 CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pfile);
17                 CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(plookup);
18                 CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(pcutoff);
19                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
20                 CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "",false,false); parameters.push_back(pmaxiter);
21                 CommandParameter psigma("sigma", "Number", "", "60", "", "", "",false,false); parameters.push_back(psigma);
22                 CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "",false,false); parameters.push_back(pmindelta);
23                 CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder);
24                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
25                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
26                 
27                 vector<string> myArray;
28                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
29                 return myArray;
30         }
31         catch(exception& e) {
32                 m->errorOut(e, "ShhherCommand", "setParameters");
33                 exit(1);
34         }
35 }
36 //**********************************************************************************************************************
37 string ShhherCommand::getHelpString(){  
38         try {
39                 string helpString = "";
40                 helpString += "The shhh.flows command reads a file containing flowgrams and creates a file of corrected sequences.\n";
41                 return helpString;
42         }
43         catch(exception& e) {
44                 m->errorOut(e, "ShhherCommand", "getHelpString");
45                 exit(1);
46         }
47 }
48 //**********************************************************************************************************************
49
50 ShhherCommand::ShhherCommand(){ 
51         try {
52                 abort = true; calledHelp = true;
53                 setParameters();
54                 
55                 //initialize outputTypes
56 //              vector<string> tempOutNames;
57 //              outputTypes["pn.dist"] = tempOutNames;
58
59         }
60         catch(exception& e) {
61                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
62                 exit(1);
63         }
64 }
65
66 //**********************************************************************************************************************
67
68 ShhherCommand::ShhherCommand(string option) {
69         try {
70         
71 #ifdef USE_MPI
72                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
73                 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
74         
75                 if(pid == 0){
76 #endif
77                 abort = false; calledHelp = false;   
78                 
79                 //allow user to run help
80                 if(option == "help") { help(); abort = true; calledHelp = true; }
81                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
82                 
83                 else {
84                         vector<string> myArray = setParameters();
85                         
86                         OptionParser parser(option);
87                         map<string,string> parameters = parser.getParameters();
88                         
89                         ValidParameters validParameter;
90                         map<string,string>::iterator it;
91                         
92                         //check to make sure all parameters are valid for command
93                         for (it = parameters.begin(); it != parameters.end(); it++) { 
94                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
95                         }
96                         
97                         //initialize outputTypes
98                         vector<string> tempOutNames;
99 //                      outputTypes["pn.dist"] = tempOutNames;
100                         //                      outputTypes["fasta"] = tempOutNames;
101                         
102                         //if the user changes the input directory command factory will send this info to us in the output parameter 
103                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
104                         if (inputDir == "not found"){   inputDir = "";          }
105                         else {
106                                 string path;
107                                 it = parameters.find("flow");
108                                 //user has given a template file
109                                 if(it != parameters.end()){ 
110                                         path = m->hasPath(it->second);
111                                         //if the user has not given a path then, add inputdir. else leave path alone.
112                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
113                                 }
114                                 
115                                 it = parameters.find("lookup");
116                                 //user has given a template file
117                                 if(it != parameters.end()){ 
118                                         path = m->hasPath(it->second);
119                                         //if the user has not given a path then, add inputdir. else leave path alone.
120                                         if (path == "") {       parameters["lookup"] = inputDir + it->second;           }
121                                 }
122
123                                 it = parameters.find("file");
124                                 //user has given a template file
125                                 if(it != parameters.end()){ 
126                                         path = m->hasPath(it->second);
127                                         //if the user has not given a path then, add inputdir. else leave path alone.
128                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
129                                 }
130                         }
131             
132             //if the user changes the output directory command factory will send this info to us in the output parameter 
133                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
134             
135                         //check for required parameters
136                         flowFileName = validParameter.validFile(parameters, "flow", true);
137                         flowFilesFileName = validParameter.validFile(parameters, "file", true);
138                         if (flowFileName == "not found" && flowFilesFileName == "not found") {
139                                 m->mothurOut("values for either flow or file must be provided for the shhh.seqs command.");
140                                 m->mothurOutEndLine();
141                                 abort = true; 
142                         }
143                         else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
144                         
145                         if(flowFileName != "not found"){
146                                 compositeFASTAFileName = "";    
147                                 compositeNamesFileName = "";    
148                         }
149                         else{
150                                 ofstream temp;
151                 
152                 string thisoutputDir = m->hasPath(flowFilesFileName); //if user entered a file with a path then preserve it
153                 
154                                 //flow.files = 9 character offset
155                                 compositeFASTAFileName = thisoutputDir + m->getRootName(m->getSimpleName(flowFilesFileName)) + "shhh.fasta";
156                                 m->openOutputFile(compositeFASTAFileName, temp);
157                                 temp.close();
158                                 
159                                 compositeNamesFileName = thisoutputDir + m->getRootName(m->getSimpleName(flowFilesFileName)) + "shhh.names";
160                                 m->openOutputFile(compositeNamesFileName, temp);
161                                 temp.close();
162                         }
163             
164             if(flowFilesFileName != "not found"){
165                 string fName;
166                 
167                 ifstream flowFilesFile;
168                 m->openInputFile(flowFilesFileName, flowFilesFile);
169                 while(flowFilesFile){
170                     fName = m->getline(flowFilesFile);
171                     
172                     //test if file is valid
173                     ifstream in;
174                     int ableToOpen = m->openInputFile(fName, in, "noerror");
175                     in.close(); 
176                     if (ableToOpen == 1) {
177                         if (inputDir != "") { //default path is set
178                             string tryPath = inputDir + fName;
179                             m->mothurOut("Unable to open " + fName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
180                             ifstream in2;
181                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
182                             in2.close();
183                             fName = tryPath;
184                         }
185                     }
186                     
187                     if (ableToOpen == 1) {
188                         if (m->getDefaultPath() != "") { //default path is set
189                             string tryPath = m->getDefaultPath() + m->getSimpleName(fName);
190                             m->mothurOut("Unable to open " + fName + ". Trying default " + tryPath); m->mothurOutEndLine();
191                             ifstream in2;
192                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
193                             in2.close();
194                             fName = tryPath;
195                         }
196                     }
197                     
198                     //if you can't open it its not in current working directory or inputDir, try mothur excutable location
199                     if (ableToOpen == 1) {
200                         string exepath = m->argv;
201                         string tempPath = exepath;
202                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
203                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
204                         
205                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(fName);
206                         m->mothurOut("Unable to open " + fName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
207                         ifstream in2;
208                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
209                         in2.close();
210                         fName = tryPath;
211                     }
212                     
213                     if (ableToOpen == 1) {  m->mothurOut("Unable to open " + fName + ". Disregarding. "); m->mothurOutEndLine();  }
214                     else { flowFileVector.push_back(fName); }
215                     m->gobble(flowFilesFile);
216                 }
217                 flowFilesFile.close();
218                 if (flowFileVector.size() == 0) {  m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
219             }
220             else{
221                 if (outputDir == "") { outputDir = m->hasPath(flowFileName); }
222                 flowFileVector.push_back(flowFileName);
223             }
224                 
225                         //check for optional parameter and set defaults
226                         // ...at some point should added some additional type checking...
227                         string temp;
228                         temp = validParameter.validFile(parameters, "lookup", true);
229                         if (temp == "not found")        {       
230                                 lookupFileName = "LookUp_Titanium.pat"; 
231                                 
232                                 int ableToOpen;
233                                 ifstream in;
234                                 ableToOpen = m->openInputFile(lookupFileName, in, "noerror");
235                                 in.close();     
236                                 
237                                 //if you can't open it, try input location
238                                 if (ableToOpen == 1) {
239                                         if (inputDir != "") { //default path is set
240                                                 string tryPath = inputDir + lookupFileName;
241                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
242                                                 ifstream in2;
243                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
244                                                 in2.close();
245                                                 lookupFileName = tryPath;
246                                         }
247                                 }
248                                 
249                                 //if you can't open it, try default location
250                                 if (ableToOpen == 1) {
251                                         if (m->getDefaultPath() != "") { //default path is set
252                                                 string tryPath = m->getDefaultPath() + m->getSimpleName(lookupFileName);
253                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying default " + tryPath); m->mothurOutEndLine();
254                                                 ifstream in2;
255                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
256                                                 in2.close();
257                                                 lookupFileName = tryPath;
258                                         }
259                                 }
260                                 
261                                 //if you can't open it its not in current working directory or inputDir, try mothur excutable location
262                                 if (ableToOpen == 1) {
263                                         string exepath = m->argv;
264                                         string tempPath = exepath;
265                                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
266                                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
267                                         
268                                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
269                                         m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
270                                         ifstream in2;
271                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
272                                         in2.close();
273                                         lookupFileName = tryPath;
274                                 }
275                                 
276                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
277                         }
278                         else if(temp == "not open")     {       
279                                 
280                                 lookupFileName = validParameter.validFile(parameters, "lookup", false);
281                                 
282                                 //if you can't open it its not inputDir, try mothur excutable location
283                                 string exepath = m->argv;
284                                 string tempPath = exepath;
285                                 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
286                                 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
287                                         
288                                 string tryPath = m->getFullPathName(exepath) + lookupFileName;
289                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
290                                 ifstream in2;
291                                 int ableToOpen = m->openInputFile(tryPath, in2, "noerror");
292                                 in2.close();
293                                 lookupFileName = tryPath;
294                                 
295                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
296                         }else                                           {       lookupFileName = temp;  }
297                         
298                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
299                         m->setProcessors(temp);
300                         m->mothurConvert(temp, processors);
301
302                         temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.01";          }
303                         m->mothurConvert(temp, cutoff); 
304                         
305                         temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){       temp = "0.000001";      }
306                         m->mothurConvert(temp, minDelta); 
307
308                         temp = validParameter.validFile(parameters, "maxiter", false);  if (temp == "not found"){       temp = "1000";          }
309                         m->mothurConvert(temp, maxIters); 
310
311                         temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
312                         m->mothurConvert(temp, sigma); 
313                         
314                         flowOrder = validParameter.validFile(parameters, "order", false);
315                         if (flowOrder == "not found"){ flowOrder = "TACG";              }
316                         else if(flowOrder.length() != 4){
317                                 m->mothurOut("The value of the order option must be four bases long\n");
318                         }
319                         
320                 }
321 #ifdef USE_MPI
322                 }                               
323 #endif
324         }
325         catch(exception& e) {
326                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
327                 exit(1);
328         }
329 }
330 //**********************************************************************************************************************
331 #ifdef USE_MPI
332 int ShhherCommand::execute(){
333         try {
334                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
335                 
336                 int tag = 1976;
337                 MPI_Status status; 
338                         
339                 if(pid == 0){
340
341                         for(int i=1;i<ncpus;i++){
342                                 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
343                         }
344                         if(abort == 1){ return 0;       }
345
346                         processors = ncpus;
347                         
348                         m->mothurOut("\nGetting preliminary data...\n");
349                         getSingleLookUp();      if (m->control_pressed) { return 0; }
350                         getJointLookUp();       if (m->control_pressed) { return 0; }
351                         
352             vector<string> flowFileVector;
353                         if(flowFilesFileName != "not found"){
354                                 string fName;
355                 
356                                 ifstream flowFilesFile;
357                                 m->openInputFile(flowFilesFileName, flowFilesFile);
358                                 while(flowFilesFile){
359                                         fName = m->getline(flowFilesFile);
360                                         flowFileVector.push_back(fName);
361                                         m->gobble(flowFilesFile);
362                                 }
363                         }
364                         else{
365                                 flowFileVector.push_back(flowFileName);
366                         }
367             
368                         int numFiles = flowFileVector.size();
369
370                         for(int i=1;i<ncpus;i++){
371                                 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
372                         }
373                         
374                         for(int i=0;i<numFiles;i++){
375                                 
376                                 if (m->control_pressed) { break; }
377                                 
378                                 double begClock = clock();
379                                 unsigned long long begTime = time(NULL);
380
381                                 flowFileName = flowFileVector[i];
382                                 
383                                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
384                                 m->mothurOut("Reading flowgrams...\n");
385                                 getFlowData();
386                                 
387                                 if (m->control_pressed) { break; }
388
389                                 m->mothurOut("Identifying unique flowgrams...\n");
390                                 getUniques();
391                                 
392                                 if (m->control_pressed) { break; }
393
394                                 m->mothurOut("Calculating distances between flowgrams...\n");
395                                 char fileName[1024];
396                                 strcpy(fileName, flowFileName.c_str());
397
398                                 for(int i=1;i<ncpus;i++){
399                                         MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
400
401                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
402                                         MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
403                                         MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
404                                         MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
405                                         MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
406                                         MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
407                                         MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
408                                         MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
409                                         MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
410                                         MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
411                                 }                               
412                                                         
413                                 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
414                                 
415                                 if (m->control_pressed) { break; }
416                                 
417                                 int done;
418                                 for(int i=1;i<ncpus;i++){
419                                         MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
420                                         
421                                         m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
422                                         m->mothurRemove((distFileName + ".temp." + toString(i)));
423                                 }
424
425                                 string namesFileName = createNamesFile();
426                                 
427                                 if (m->control_pressed) { break; }
428                                 
429                                 m->mothurOut("\nClustering flowgrams...\n");
430                                 string listFileName = cluster(distFileName, namesFileName);
431                                 
432                                 if (m->control_pressed) { break; }
433                                 
434                 
435                 
436                                 getOTUData(listFileName);
437
438                                 m->mothurRemove(distFileName);
439                                 m->mothurRemove(namesFileName);
440                                 m->mothurRemove(listFileName);
441                                 
442                                 if (m->control_pressed) { break; }
443                                 
444                                 initPyroCluster();
445
446                                 if (m->control_pressed) { break; }
447                                 
448             
449                                 for(int i=1;i<ncpus;i++){
450                                         MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
451                                         MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
452                                         MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
453                                         MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
454                                 }
455                                 
456                                 if (m->control_pressed) { break; }
457                                 
458                                 double maxDelta = 0;
459                                 int iter = 0;
460                                 
461                                 int numOTUsOnCPU = numOTUs / ncpus;
462                                 int numSeqsOnCPU = numSeqs / ncpus;
463                                 m->mothurOut("\nDenoising flowgrams...\n");
464                                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
465                                 
466                                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
467                                         
468                                         double cycClock = clock();
469                                         unsigned long long cycTime = time(NULL);
470                                         fill();
471                                         
472                                         if (m->control_pressed) { break; }
473
474                                         int total = singleTau.size();
475                                         for(int i=1;i<ncpus;i++){
476                                                 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
477                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
478                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
479                                                 
480                                                 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
481                                                 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
482                                                 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
483                                                 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
484                                                 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
485                                         }
486                                         
487                                         calcCentroidsDriver(0, numOTUsOnCPU);
488                                         
489                                         for(int i=1;i<ncpus;i++){
490                                                 int otuStart = i * numOTUs / ncpus;
491                                                 int otuStop = (i + 1) * numOTUs / ncpus;
492                                                 
493                                                 vector<int> tempCentroids(numOTUs, 0);
494                                                 vector<short> tempChange(numOTUs, 0);
495                                                 
496                                                 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
497                                                 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
498                                                 
499                                                 for(int j=otuStart;j<otuStop;j++){
500                                                         centroids[j] = tempCentroids[j];
501                                                         change[j] = tempChange[j];
502                                                 }
503                                         }
504                                                                         
505                                         maxDelta = getNewWeights(); if (m->control_pressed) { break; }
506                                         double nLL = getLikelihood(); if (m->control_pressed) { break; }
507                                         checkCentroids(); if (m->control_pressed) { break; }
508                                         
509                                         for(int i=1;i<ncpus;i++){
510                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
511                                                 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
512                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
513                                         }
514                                         
515                                         calcNewDistancesParent(0, numSeqsOnCPU);
516
517                                         total = singleTau.size();
518
519                                         for(int i=1;i<ncpus;i++){
520                                                 int childTotal;
521                                                 int seqStart = i * numSeqs / ncpus;
522                                                 int seqStop = (i + 1) * numSeqs / ncpus;
523                                                 
524                                                 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
525
526                                                 vector<int> childSeqIndex(childTotal, 0);
527                                                 vector<double> childSingleTau(childTotal, 0);
528                                                 vector<double> childDist(numSeqs * numOTUs, 0);
529                                                 vector<int> otuIndex(childTotal, 0);
530                                                 
531                                                 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
532                                                 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
533                                                 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
534                                                 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
535
536                                                 int oldTotal = total;
537                                                 total += childTotal;
538                                                 singleTau.resize(total, 0);
539                                                 seqIndex.resize(total, 0);
540                                                 seqNumber.resize(total, 0);
541                                                 
542                                                 int childIndex = 0;
543                                                 
544                                                 for(int j=oldTotal;j<total;j++){
545                                                         int otuI = otuIndex[childIndex];
546                                                         int seqI = childSeqIndex[childIndex];
547                                                         
548                                                         singleTau[j] = childSingleTau[childIndex];
549                                                         
550                                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
551                                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
552                                                         nSeqsPerOTU[otuI]++;
553                                                         childIndex++;
554                                                 }
555                                                 
556                                                 int index = seqStart * numOTUs;
557                                                 for(int j=seqStart;j<seqStop;j++){
558                                                         for(int k=0;k<numOTUs;k++){
559                                                                 dist[index] = childDist[index];
560                                                                 index++;
561                                                         }
562                                                 }                                       
563                                         }
564                                         
565                                         iter++;
566                                         
567                                         m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');                  
568
569                                         if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
570                                                 int live = 1;
571                                                 for(int i=1;i<ncpus;i++){
572                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
573                                                 }
574                                         }
575                                         else{
576                                                 int live = 0;
577                                                 for(int i=1;i<ncpus;i++){
578                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
579                                                 }
580                                         }
581                                         
582                                 }       
583                                 
584                                 if (m->control_pressed) { break; }
585                                 
586                                 m->mothurOut("\nFinalizing...\n");
587                                 fill();
588                                 
589                                 if (m->control_pressed) { break; }
590                                 
591                                 setOTUs();
592                                 
593                                 vector<int> otuCounts(numOTUs, 0);
594                                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
595                                 calcCentroidsDriver(0, numOTUs);
596                                 
597                                 if (m->control_pressed) { break; }
598                                 
599                                 writeQualities(otuCounts);      if (m->control_pressed) { break; }
600                                 writeSequences(otuCounts);      if (m->control_pressed) { break; }
601                                 writeNames(otuCounts);          if (m->control_pressed) { break; }
602                                 writeClusters(otuCounts);       if (m->control_pressed) { break; }
603                                 writeGroups();                          if (m->control_pressed) { break; }
604                                 
605                                                                  
606                                 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');                 
607                         }
608                 }
609                 else{
610                         int abort = 1;
611
612                         MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
613                         if(abort){      return 0;       }
614
615                         int numFiles;
616                         MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
617
618                         for(int i=0;i<numFiles;i++){
619                                 
620                                 if (m->control_pressed) { break; }
621                                 
622                                 //Now into the pyrodist part
623                                 bool live = 1;
624
625                                 char fileName[1024];
626                                 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
627                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
628                                 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
629                                 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
630                                 
631                                 flowDataIntI.resize(numSeqs * numFlowCells);
632                                 flowDataPrI.resize(numSeqs * numFlowCells);
633                                 mapUniqueToSeq.resize(numSeqs);
634                                 mapSeqToUnique.resize(numSeqs);
635                                 lengths.resize(numSeqs);
636                                 jointLookUp.resize(NUMBINS * NUMBINS);
637
638                                 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
639                                 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
640                                 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
641                                 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
642                                 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
643                                 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
644                                 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
645
646                                 flowFileName = string(fileName);
647                                 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
648                                 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
649                                 
650                                 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
651                                 
652                                 if (m->control_pressed) { break; }
653                                 
654                                 int done = 1;
655                                 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
656
657                                 //Now into the pyronoise part
658                                 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
659                                 
660                                 singleLookUp.resize(HOMOPS * NUMBINS);
661                                 uniqueFlowgrams.resize(numUniques * numFlowCells);
662                                 weight.resize(numOTUs);
663                                 centroids.resize(numOTUs);
664                                 change.resize(numOTUs);
665                                 dist.assign(numOTUs * numSeqs, 0);
666                                 nSeqsPerOTU.resize(numOTUs);
667                                 cumNumSeqs.resize(numOTUs);
668
669                                 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
670                                 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
671                                 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
672                                 
673                                 int startOTU = pid * numOTUs / ncpus;
674                                 int endOTU = (pid + 1) * numOTUs / ncpus;
675
676                                 int startSeq = pid * numSeqs / ncpus;
677                                 int endSeq = (pid + 1) * numSeqs /ncpus;
678                                 
679                                 int total;
680
681                                 while(live){
682                                         
683                                         if (m->control_pressed) { break; }
684                                         
685                                         MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
686                                         singleTau.assign(total, 0.0000);
687                                         seqNumber.assign(total, 0);
688                                         seqIndex.assign(total, 0);
689
690                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
691                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
692                                         MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
693                                         MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
694                                         MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
695                                         MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
696                                         MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
697
698                                         calcCentroidsDriver(startOTU, endOTU);
699
700                                         MPI_Send(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
701                                         MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
702
703                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
704                                         MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
705                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
706
707                                         vector<int> otuIndex(total, 0);
708                                         calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
709                                         total = otuIndex.size();
710                                         
711                                         MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
712                                         MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
713                                         MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
714                                         MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
715                                         MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
716                                 
717                                         MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
718                                 }
719                         }
720                 }
721                 
722                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
723                 
724                 MPI_Barrier(MPI_COMM_WORLD);
725
726                 
727                 if(compositeFASTAFileName != ""){
728                         outputNames.push_back(compositeFASTAFileName);
729                         outputNames.push_back(compositeNamesFileName);
730                 }
731
732                 m->mothurOutEndLine();
733                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
734                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
735                 m->mothurOutEndLine();
736                 
737                 return 0;
738
739         }
740         catch(exception& e) {
741                 m->errorOut(e, "ShhherCommand", "execute");
742                 exit(1);
743         }
744 }
745 /**************************************************************************************************/
746 string ShhherCommand::createNamesFile(){
747         try{
748                 
749                 vector<string> duplicateNames(numUniques, "");
750                 for(int i=0;i<numSeqs;i++){
751                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
752                 }
753                 
754                 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
755                 
756                 ofstream nameFile;
757                 m->openOutputFile(nameFileName, nameFile);
758                 
759                 for(int i=0;i<numUniques;i++){
760                         
761                         if (m->control_pressed) { break; }
762                         
763             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
764                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
765                 }
766                 
767                 nameFile.close();
768                 return  nameFileName;
769         }
770         catch(exception& e) {
771                 m->errorOut(e, "ShhherCommand", "createNamesFile");
772                 exit(1);
773         }
774 }
775 /**************************************************************************************************/
776
777 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
778         try{            
779                 ostringstream outStream;
780                 outStream.setf(ios::fixed, ios::floatfield);
781                 outStream.setf(ios::dec, ios::basefield);
782                 outStream.setf(ios::showpoint);
783                 outStream.precision(6);
784                 
785                 int begTime = time(NULL);
786                 double begClock = clock();
787                 
788                 for(int i=startSeq;i<stopSeq;i++){
789                         
790                         if (m->control_pressed) { break; }
791                         
792                         for(int j=0;j<i;j++){
793                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
794                                 
795                                 if(flowDistance < 1e-6){
796                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
797                                 }
798                                 else if(flowDistance <= cutoff){
799                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
800                                 }
801                         }
802                         if(i % 100 == 0){
803                                 m->mothurOut(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
804                         }
805                 }
806                 
807                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
808                 if(pid != 0){   fDistFileName += ".temp." + toString(pid);      }
809                 
810                 if (m->control_pressed) { return fDistFileName; }
811                 
812                 m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
813
814                 ofstream distFile(fDistFileName.c_str());
815                 distFile << outStream.str();            
816                 distFile.close();
817                 
818                 return fDistFileName;
819         }
820         catch(exception& e) {
821                 m->errorOut(e, "ShhherCommand", "flowDistMPI");
822                 exit(1);
823         }
824 }
825 /**************************************************************************************************/
826  
827 void ShhherCommand::getOTUData(string listFileName){
828     try {
829         
830         ifstream listFile;
831         m->openInputFile(listFileName, listFile);
832         string label;
833         
834         listFile >> label >> numOTUs;
835         
836         otuData.assign(numSeqs, 0);
837         cumNumSeqs.assign(numOTUs, 0);
838         nSeqsPerOTU.assign(numOTUs, 0);
839         aaP.clear();aaP.resize(numOTUs);
840         
841         seqNumber.clear();
842         aaI.clear();
843         seqIndex.clear();
844         
845         string singleOTU = "";
846         
847         for(int i=0;i<numOTUs;i++){
848             
849             if (m->control_pressed) { break; }
850             
851             listFile >> singleOTU;
852             
853             istringstream otuString(singleOTU);
854             
855             while(otuString){
856                 
857                 string seqName = "";
858                 
859                 for(int j=0;j<singleOTU.length();j++){
860                     char letter = otuString.get();
861                     
862                     if(letter != ','){
863                         seqName += letter;
864                     }
865                     else{
866                         map<string,int>::iterator nmIt = nameMap.find(seqName);
867                         int index = nmIt->second;
868                         
869                         nameMap.erase(nmIt);
870                         
871                         otuData[index] = i;
872                         nSeqsPerOTU[i]++;
873                         aaP[i].push_back(index);
874                         seqName = "";
875                     }
876                 }
877                 
878                 map<string,int>::iterator nmIt = nameMap.find(seqName);
879                 
880                 int index = nmIt->second;
881                 nameMap.erase(nmIt);
882                 
883                 otuData[index] = i;
884                 nSeqsPerOTU[i]++;
885                 aaP[i].push_back(index);        
886                 
887                 otuString.get();
888             }
889             
890             sort(aaP[i].begin(), aaP[i].end());
891             for(int j=0;j<nSeqsPerOTU[i];j++){
892                 seqNumber.push_back(aaP[i][j]);
893             }
894             for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
895                 aaP[i].push_back(0);
896             }
897             
898             
899         }
900         
901         for(int i=1;i<numOTUs;i++){
902             cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
903         }
904         aaI = aaP;
905         seqIndex = seqNumber;
906         
907         listFile.close();       
908         
909     }
910     catch(exception& e) {
911         m->errorOut(e, "ShhherCommand", "getOTUData");
912         exit(1);        
913     }           
914 }
915
916 /**************************************************************************************************/
917
918 void ShhherCommand::initPyroCluster(){                          
919     try{
920         if (numOTUs < processors) { processors = 1; }
921         
922         dist.assign(numSeqs * numOTUs, 0);
923         change.assign(numOTUs, 1);
924         centroids.assign(numOTUs, -1);
925         weight.assign(numOTUs, 0);
926         singleTau.assign(numSeqs, 1.0);
927         
928         nSeqsBreaks.assign(processors+1, 0);
929         nOTUsBreaks.assign(processors+1, 0);
930         
931         nSeqsBreaks[0] = 0;
932         for(int i=0;i<processors;i++){
933             nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
934             nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
935         }
936         nSeqsBreaks[processors] = numSeqs;
937         nOTUsBreaks[processors] = numOTUs;
938     }
939     catch(exception& e) {
940         m->errorOut(e, "ShhherCommand", "initPyroCluster");
941         exit(1);        
942     }
943 }
944
945 /**************************************************************************************************/
946
947 void ShhherCommand::fill(){
948     try {
949         int index = 0;
950         for(int i=0;i<numOTUs;i++){
951             
952             if (m->control_pressed) { break; }
953             
954             cumNumSeqs[i] = index;
955             for(int j=0;j<nSeqsPerOTU[i];j++){
956                 seqNumber[index] = aaP[i][j];
957                 seqIndex[index] = aaI[i][j];
958                 
959                 index++;
960             }
961         }
962     }
963     catch(exception& e) {
964         m->errorOut(e, "ShhherCommand", "fill");
965         exit(1);        
966     }           
967 }
968
969 /**************************************************************************************************/
970  
971 void ShhherCommand::getFlowData(){
972     try{
973         ifstream flowFile;
974         m->openInputFile(flowFileName, flowFile);
975         
976         string seqName;
977         seqNameVector.clear();
978         lengths.clear();
979         flowDataIntI.clear();
980         nameMap.clear();
981         
982         
983         int currentNumFlowCells;
984         
985         float intensity;
986         
987         flowFile >> numFlowCells;
988         int index = 0;//pcluster
989         while(!flowFile.eof()){
990             
991             if (m->control_pressed) { break; }
992             
993             flowFile >> seqName >> currentNumFlowCells;
994             lengths.push_back(currentNumFlowCells);
995             
996             seqNameVector.push_back(seqName);
997             nameMap[seqName] = index++;//pcluster
998             
999             for(int i=0;i<numFlowCells;i++){
1000                 flowFile >> intensity;
1001                 if(intensity > 9.99)    {       intensity = 9.99;       }
1002                 int intI = int(100 * intensity + 0.0001);
1003                 flowDataIntI.push_back(intI);
1004             }
1005             m->gobble(flowFile);
1006         }
1007         flowFile.close();
1008         
1009         numSeqs = seqNameVector.size();         
1010         
1011         for(int i=0;i<numSeqs;i++){
1012             
1013             if (m->control_pressed) { break; }
1014             
1015             int iNumFlowCells = i * numFlowCells;
1016             for(int j=lengths[i];j<numFlowCells;j++){
1017                 flowDataIntI[iNumFlowCells + j] = 0;
1018             }
1019         }
1020         
1021     }
1022     catch(exception& e) {
1023         m->errorOut(e, "ShhherCommand", "getFlowData");
1024         exit(1);
1025     }
1026 }
1027 /**************************************************************************************************/
1028 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1029         
1030         try{
1031                 vector<double> newTau(numOTUs,0);
1032                 vector<double> norms(numSeqs, 0);
1033                 otuIndex.clear();
1034                 seqIndex.clear();
1035                 singleTau.clear();
1036                 
1037                 for(int i=startSeq;i<stopSeq;i++){
1038                         
1039                         if (m->control_pressed) { break; }
1040                         
1041                         double offset = 1e8;
1042                         int indexOffset = i * numOTUs;
1043                         
1044                         for(int j=0;j<numOTUs;j++){
1045                                 
1046                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1047                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1048                                 }
1049                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1050                                         offset = dist[indexOffset + j];
1051                                 }
1052                         }
1053                         
1054                         for(int j=0;j<numOTUs;j++){
1055                                 if(weight[j] > MIN_WEIGHT){
1056                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1057                                         norms[i] += newTau[j];
1058                                 }
1059                                 else{
1060                                         newTau[j] = 0.0;
1061                                 }
1062                         }
1063                         
1064                         for(int j=0;j<numOTUs;j++){
1065                 
1066                                 newTau[j] /= norms[i];
1067                                 
1068                                 if(newTau[j] > MIN_TAU){
1069                                         otuIndex.push_back(j);
1070                                         seqIndex.push_back(i);
1071                                         singleTau.push_back(newTau[j]);
1072                                 }
1073                         }
1074                         
1075                 }
1076         }
1077         catch(exception& e) {
1078                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1079                 exit(1);        
1080         }               
1081 }
1082
1083 /**************************************************************************************************/
1084
1085 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1086     
1087     try{
1088         
1089         int total = 0;
1090         vector<double> newTau(numOTUs,0);
1091         vector<double> norms(numSeqs, 0);
1092         nSeqsPerOTU.assign(numOTUs, 0);
1093         
1094         for(int i=startSeq;i<stopSeq;i++){
1095             
1096             if (m->control_pressed) { break; }
1097             
1098             int indexOffset = i * numOTUs;
1099             
1100             double offset = 1e8;
1101             
1102             for(int j=0;j<numOTUs;j++){
1103                 
1104                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1105                     dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1106                 }
1107                 
1108                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1109                     offset = dist[indexOffset + j];
1110                 }
1111             }
1112             
1113             for(int j=0;j<numOTUs;j++){
1114                 if(weight[j] > MIN_WEIGHT){
1115                     newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1116                     norms[i] += newTau[j];
1117                 }
1118                 else{
1119                     newTau[j] = 0.0;
1120                 }
1121             }
1122             
1123             for(int j=0;j<numOTUs;j++){
1124                 newTau[j] /= norms[i];
1125             }
1126             
1127             for(int j=0;j<numOTUs;j++){
1128                 if(newTau[j] > MIN_TAU){
1129                     
1130                     int oldTotal = total;
1131                     
1132                     total++;
1133                     
1134                     singleTau.resize(total, 0);
1135                     seqNumber.resize(total, 0);
1136                     seqIndex.resize(total, 0);
1137                     
1138                     singleTau[oldTotal] = newTau[j];
1139                     
1140                     aaP[j][nSeqsPerOTU[j]] = oldTotal;
1141                     aaI[j][nSeqsPerOTU[j]] = i;
1142                     nSeqsPerOTU[j]++;
1143                 }
1144             }
1145             
1146         }
1147         
1148     }
1149     catch(exception& e) {
1150         m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1151         exit(1);        
1152     }           
1153 }
1154
1155 /**************************************************************************************************/
1156
1157 void ShhherCommand::setOTUs(){
1158     
1159     try {
1160         vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1161         
1162         for(int i=0;i<numOTUs;i++){
1163             
1164             if (m->control_pressed) { break; }
1165             
1166             for(int j=0;j<nSeqsPerOTU[i];j++){
1167                 int index = cumNumSeqs[i] + j;
1168                 double tauValue = singleTau[seqNumber[index]];
1169                 int sIndex = seqIndex[index];
1170                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
1171             }
1172         }
1173         
1174         for(int i=0;i<numSeqs;i++){
1175             double maxTau = -1.0000;
1176             int maxOTU = -1;
1177             for(int j=0;j<numOTUs;j++){
1178                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1179                     maxTau = bigTauMatrix[i * numOTUs + j];
1180                     maxOTU = j;
1181                 }
1182             }
1183             
1184             otuData[i] = maxOTU;
1185         }
1186         
1187         nSeqsPerOTU.assign(numOTUs, 0);         
1188         
1189         for(int i=0;i<numSeqs;i++){
1190             int index = otuData[i];
1191             
1192             singleTau[i] = 1.0000;
1193             dist[i] = 0.0000;
1194             
1195             aaP[index][nSeqsPerOTU[index]] = i;
1196             aaI[index][nSeqsPerOTU[index]] = i;
1197             
1198             nSeqsPerOTU[index]++;
1199         }
1200         fill(); 
1201     }
1202     catch(exception& e) {
1203         m->errorOut(e, "ShhherCommand", "setOTUs");
1204         exit(1);        
1205     }           
1206 }
1207
1208 /**************************************************************************************************/
1209  
1210 void ShhherCommand::getUniques(){
1211     try{
1212         
1213         
1214         numUniques = 0;
1215         uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
1216         uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
1217         uniqueLengths.assign(numSeqs, 0);
1218         mapSeqToUnique.assign(numSeqs, -1);
1219         mapUniqueToSeq.assign(numSeqs, -1);
1220         
1221         vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
1222         
1223         for(int i=0;i<numSeqs;i++){
1224             
1225             if (m->control_pressed) { break; }
1226             
1227             int index = 0;
1228             
1229             vector<short> current(numFlowCells);
1230             for(int j=0;j<numFlowCells;j++){
1231                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
1232             }
1233             
1234             for(int j=0;j<numUniques;j++){
1235                 int offset = j * numFlowCells;
1236                 bool toEnd = 1;
1237                 
1238                 int shorterLength;
1239                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
1240                 else                                                            {       shorterLength = uniqueLengths[j];       }
1241                 
1242                 for(int k=0;k<shorterLength;k++){
1243                     if(current[k] != uniqueFlowgrams[offset + k]){
1244                         toEnd = 0;
1245                         break;
1246                     }
1247                 }
1248                 
1249                 if(toEnd){
1250                     mapSeqToUnique[i] = j;
1251                     uniqueCount[j]++;
1252                     index = j;
1253                     if(lengths[i] > uniqueLengths[j])   {       uniqueLengths[j] = lengths[i];  }
1254                     break;
1255                 }
1256                 index++;
1257             }
1258             
1259             if(index == numUniques){
1260                 uniqueLengths[numUniques] = lengths[i];
1261                 uniqueCount[numUniques] = 1;
1262                 mapSeqToUnique[i] = numUniques;//anMap
1263                 mapUniqueToSeq[numUniques] = i;//anF
1264                 
1265                 for(int k=0;k<numFlowCells;k++){
1266                     uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
1267                     uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
1268                 }
1269                 
1270                 numUniques++;
1271             }
1272         }
1273         uniqueFlowDataIntI.resize(numFlowCells * numUniques);
1274         uniqueLengths.resize(numUniques);       
1275         
1276         flowDataPrI.resize(numSeqs * numFlowCells, 0);
1277         for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
1278     }
1279     catch(exception& e) {
1280         m->errorOut(e, "ShhherCommand", "getUniques");
1281         exit(1);
1282     }
1283 }
1284
1285 /**************************************************************************************************/
1286
1287 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
1288     try{
1289         int minLength = lengths[mapSeqToUnique[seqA]];
1290         if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
1291         
1292         int ANumFlowCells = seqA * numFlowCells;
1293         int BNumFlowCells = seqB * numFlowCells;
1294         
1295         float dist = 0;
1296         
1297         for(int i=0;i<minLength;i++){
1298             
1299             if (m->control_pressed) { break; }
1300             
1301             int flowAIntI = flowDataIntI[ANumFlowCells + i];
1302             float flowAPrI = flowDataPrI[ANumFlowCells + i];
1303             
1304             int flowBIntI = flowDataIntI[BNumFlowCells + i];
1305             float flowBPrI = flowDataPrI[BNumFlowCells + i];
1306             dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
1307         }
1308         
1309         dist /= (float) minLength;
1310         return dist;
1311     }
1312     catch(exception& e) {
1313         m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
1314         exit(1);
1315     }
1316 }
1317
1318 //**********************************************************************************************************************/
1319
1320 string ShhherCommand::cluster(string distFileName, string namesFileName){
1321     try {
1322         
1323         ReadMatrix* read = new ReadColumnMatrix(distFileName);  
1324         read->setCutoff(cutoff);
1325         
1326         NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1327         clusterNameMap->readMap();
1328         read->read(clusterNameMap);
1329         
1330         ListVector* list = read->getListVector();
1331         SparseMatrix* matrix = read->getMatrix();
1332         
1333         delete read; 
1334         delete clusterNameMap; 
1335         
1336         RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
1337         
1338         Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
1339         string tag = cluster->getTag();
1340         
1341         double clusterCutoff = cutoff;
1342         while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1343             
1344             if (m->control_pressed) { break; }
1345             
1346             cluster->update(clusterCutoff);
1347         }
1348         
1349         list->setLabel(toString(cutoff));
1350         
1351         string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
1352         ofstream listFile;
1353         m->openOutputFile(listFileName, listFile);
1354         list->print(listFile);
1355         listFile.close();
1356         
1357         delete matrix;  delete cluster; delete rabund; delete list;
1358         
1359         return listFileName;
1360     }
1361     catch(exception& e) {
1362         m->errorOut(e, "ShhherCommand", "cluster");
1363         exit(1);        
1364     }           
1365 }
1366
1367 /**************************************************************************************************/
1368
1369 void ShhherCommand::calcCentroidsDriver(int start, int finish){                          
1370     
1371     //this function gets the most likely homopolymer length at a flow position for a group of sequences
1372     //within an otu
1373     
1374     try{
1375         
1376         for(int i=start;i<finish;i++){
1377             
1378             if (m->control_pressed) { break; }
1379             
1380             double count = 0;
1381             int position = 0;
1382             int minFlowGram = 100000000;
1383             double minFlowValue = 1e8;
1384             change[i] = 0; //FALSE
1385             
1386             for(int j=0;j<nSeqsPerOTU[i];j++){
1387                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1388             }
1389             
1390             if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1391                 vector<double> adF(nSeqsPerOTU[i]);
1392                 vector<int> anL(nSeqsPerOTU[i]);
1393                 
1394                 for(int j=0;j<nSeqsPerOTU[i];j++){
1395                     int index = cumNumSeqs[i] + j;
1396                     int nI = seqIndex[index];
1397                     int nIU = mapSeqToUnique[nI];
1398                     
1399                     int k;
1400                     for(k=0;k<position;k++){
1401                         if(nIU == anL[k]){
1402                             break;
1403                         }
1404                     }
1405                     if(k == position){
1406                         anL[position] = nIU;
1407                         adF[position] = 0.0000;
1408                         position++;
1409                     }                                           
1410                 }
1411                 
1412                 for(int j=0;j<nSeqsPerOTU[i];j++){
1413                     int index = cumNumSeqs[i] + j;
1414                     int nI = seqIndex[index];
1415                     
1416                     double tauValue = singleTau[seqNumber[index]];
1417                     
1418                     for(int k=0;k<position;k++){
1419                         double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1420                         adF[k] += dist * tauValue;
1421                     }
1422                 }
1423                 
1424                 for(int j=0;j<position;j++){
1425                     if(adF[j] < minFlowValue){
1426                         minFlowGram = j;
1427                         minFlowValue = adF[j];
1428                     }
1429                 }
1430                 
1431                 if(centroids[i] != anL[minFlowGram]){
1432                     change[i] = 1;
1433                     centroids[i] = anL[minFlowGram];
1434                 }
1435             }
1436             else if(centroids[i] != -1){
1437                 change[i] = 1;
1438                 centroids[i] = -1;                      
1439             }
1440         }
1441     }
1442     catch(exception& e) {
1443         m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1444         exit(1);        
1445     }           
1446 }
1447
1448 /**************************************************************************************************/
1449
1450 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1451     try{
1452         
1453         int flowAValue = cent * numFlowCells;
1454         int flowBValue = flow * numFlowCells;
1455         
1456         double dist = 0;
1457         
1458         for(int i=0;i<length;i++){
1459             dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1460             flowAValue++;
1461             flowBValue++;
1462         }
1463         
1464         return dist / (double)length;
1465     }
1466     catch(exception& e) {
1467         m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1468         exit(1);        
1469     }           
1470 }
1471
1472 /**************************************************************************************************/
1473
1474 double ShhherCommand::getNewWeights(){
1475     try{
1476         
1477         double maxChange = 0;
1478         
1479         for(int i=0;i<numOTUs;i++){
1480             
1481             if (m->control_pressed) { break; }
1482             
1483             double difference = weight[i];
1484             weight[i] = 0;
1485             
1486             for(int j=0;j<nSeqsPerOTU[i];j++){
1487                 int index = cumNumSeqs[i] + j;
1488                 double tauValue = singleTau[seqNumber[index]];
1489                 weight[i] += tauValue;
1490             }
1491             
1492             difference = fabs(weight[i] - difference);
1493             if(difference > maxChange){ maxChange = difference; }
1494         }
1495         return maxChange;
1496     }
1497     catch(exception& e) {
1498         m->errorOut(e, "ShhherCommand", "getNewWeights");
1499         exit(1);        
1500     }           
1501 }
1502  
1503  /**************************************************************************************************/
1504  
1505 double ShhherCommand::getLikelihood(){
1506     
1507     try{
1508         
1509         vector<long double> P(numSeqs, 0);
1510         int effNumOTUs = 0;
1511         
1512         for(int i=0;i<numOTUs;i++){
1513             if(weight[i] > MIN_WEIGHT){
1514                 effNumOTUs++;
1515             }
1516         }
1517         
1518         string hold;
1519         for(int i=0;i<numOTUs;i++){
1520             
1521             if (m->control_pressed) { break; }
1522             
1523             for(int j=0;j<nSeqsPerOTU[i];j++){
1524                 int index = cumNumSeqs[i] + j;
1525                 int nI = seqIndex[index];
1526                 double singleDist = dist[seqNumber[index]];
1527                 
1528                 P[nI] += weight[i] * exp(-singleDist * sigma);
1529             }
1530         }
1531         double nLL = 0.00;
1532         for(int i=0;i<numSeqs;i++){
1533             if(P[i] == 0){      P[i] = DBL_EPSILON;     }
1534             
1535             nLL += -log(P[i]);
1536         }
1537         
1538         nLL = nLL -(double)numSeqs * log(sigma);
1539         
1540         return nLL; 
1541     }
1542     catch(exception& e) {
1543         m->errorOut(e, "ShhherCommand", "getNewWeights");
1544         exit(1);        
1545     }           
1546 }
1547
1548 /**************************************************************************************************/
1549
1550 void ShhherCommand::checkCentroids(){
1551     try{
1552         vector<int> unique(numOTUs, 1);
1553         
1554         for(int i=0;i<numOTUs;i++){
1555             if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1556                 unique[i] = -1;
1557             }
1558         }
1559         
1560         for(int i=0;i<numOTUs;i++){
1561             
1562             if (m->control_pressed) { break; }
1563             
1564             if(unique[i] == 1){
1565                 for(int j=i+1;j<numOTUs;j++){
1566                     if(unique[j] == 1){
1567                         
1568                         if(centroids[j] == centroids[i]){
1569                             unique[j] = 0;
1570                             centroids[j] = -1;
1571                             
1572                             weight[i] += weight[j];
1573                             weight[j] = 0.0;
1574                         }
1575                     }
1576                 }
1577             }
1578         }
1579     }
1580     catch(exception& e) {
1581         m->errorOut(e, "ShhherCommand", "checkCentroids");
1582         exit(1);        
1583     }           
1584 }
1585  /**************************************************************************************************/
1586
1587
1588  
1589 void ShhherCommand::writeQualities(vector<int> otuCounts){
1590     
1591     try {
1592         string thisOutputDir = outputDir;
1593         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1594         string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.qual";
1595         
1596         ofstream qualityFile;
1597         m->openOutputFile(qualityFileName, qualityFile);
1598         
1599         qualityFile.setf(ios::fixed, ios::floatfield);
1600         qualityFile.setf(ios::showpoint);
1601         qualityFile << setprecision(6);
1602         
1603         vector<vector<int> > qualities(numOTUs);
1604         vector<double> pr(HOMOPS, 0);
1605         
1606         
1607         for(int i=0;i<numOTUs;i++){
1608             
1609             if (m->control_pressed) { break; }
1610             
1611             int index = 0;
1612             int base = 0;
1613             
1614             if(nSeqsPerOTU[i] > 0){
1615                 qualities[i].assign(1024, -1);
1616                 
1617                 while(index < numFlowCells){
1618                     double maxPrValue = 1e8;
1619                     short maxPrIndex = -1;
1620                     double count = 0.0000;
1621                     
1622                     pr.assign(HOMOPS, 0);
1623                     
1624                     for(int j=0;j<nSeqsPerOTU[i];j++){
1625                         int lIndex = cumNumSeqs[i] + j;
1626                         double tauValue = singleTau[seqNumber[lIndex]];
1627                         int sequenceIndex = aaI[i][j];
1628                         short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1629                         
1630                         count += tauValue;
1631                         
1632                         for(int s=0;s<HOMOPS;s++){
1633                             pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1634                         }
1635                     }
1636                     
1637                     maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1638                     maxPrValue = pr[maxPrIndex];
1639                     
1640                     if(count > MIN_COUNT){
1641                         double U = 0.0000;
1642                         double norm = 0.0000;
1643                         
1644                         for(int s=0;s<HOMOPS;s++){
1645                             norm += exp(-(pr[s] - maxPrValue));
1646                         }
1647                         
1648                         for(int s=1;s<=maxPrIndex;s++){
1649                             int value = 0;
1650                             double temp = 0.0000;
1651                             
1652                             U += exp(-(pr[s-1]-maxPrValue))/norm;
1653                             
1654                             if(U>0.00){
1655                                 temp = log10(U);
1656                             }
1657                             else{
1658                                 temp = -10.1;
1659                             }
1660                             temp = floor(-10 * temp);
1661                             value = (int)floor(temp);
1662                             if(value > 100){    value = 100;    }
1663                             
1664                             qualities[i][base] = (int)value;
1665                             base++;
1666                         }
1667                     }
1668                     
1669                     index++;
1670                 }
1671             }
1672             
1673             
1674             if(otuCounts[i] > 0){
1675                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1676                 
1677                 int j=4;        //need to get past the first four bases
1678                 while(qualities[i][j] != -1){
1679                     qualityFile << qualities[i][j] << ' ';
1680                     j++;
1681                 }
1682                 qualityFile << endl;
1683             }
1684         }
1685         qualityFile.close();
1686         outputNames.push_back(qualityFileName);
1687         
1688     }
1689     catch(exception& e) {
1690         m->errorOut(e, "ShhherCommand", "writeQualities");
1691         exit(1);        
1692     }           
1693 }
1694
1695 /**************************************************************************************************/
1696
1697 void ShhherCommand::writeSequences(vector<int> otuCounts){
1698     try {
1699         string thisOutputDir = outputDir;
1700         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1701         string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.fasta";
1702         ofstream fastaFile;
1703         m->openOutputFile(fastaFileName, fastaFile);
1704         
1705         vector<string> names(numOTUs, "");
1706         
1707         for(int i=0;i<numOTUs;i++){
1708             
1709             if (m->control_pressed) { break; }
1710             
1711             int index = centroids[i];
1712             
1713             if(otuCounts[i] > 0){
1714                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1715                 
1716                 string newSeq = "";
1717                 
1718                 for(int j=0;j<numFlowCells;j++){
1719                     
1720                     char base = flowOrder[j % 4];
1721                     for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
1722                         newSeq += base;
1723                     }
1724                 }
1725                 
1726                 fastaFile << newSeq.substr(4) << endl;
1727             }
1728         }
1729         fastaFile.close();
1730         
1731         outputNames.push_back(fastaFileName);
1732         
1733         if(compositeFASTAFileName != ""){
1734             m->appendFiles(fastaFileName, compositeFASTAFileName);
1735         }
1736     }
1737     catch(exception& e) {
1738         m->errorOut(e, "ShhherCommand", "writeSequences");
1739         exit(1);        
1740     }           
1741 }
1742
1743 /**************************************************************************************************/
1744
1745 void ShhherCommand::writeNames(vector<int> otuCounts){
1746     try {
1747         string thisOutputDir = outputDir;
1748         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1749         string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.names";
1750         ofstream nameFile;
1751         m->openOutputFile(nameFileName, nameFile);
1752         
1753         for(int i=0;i<numOTUs;i++){
1754             
1755             if (m->control_pressed) { break; }
1756             
1757             if(otuCounts[i] > 0){
1758                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
1759                 
1760                 for(int j=1;j<nSeqsPerOTU[i];j++){
1761                     nameFile << ',' << seqNameVector[aaI[i][j]];
1762                 }
1763                 
1764                 nameFile << endl;
1765             }
1766         }
1767         nameFile.close();
1768         outputNames.push_back(nameFileName);
1769         
1770         
1771         if(compositeNamesFileName != ""){
1772             m->appendFiles(nameFileName, compositeNamesFileName);
1773         }               
1774     }
1775     catch(exception& e) {
1776         m->errorOut(e, "ShhherCommand", "writeNames");
1777         exit(1);        
1778     }           
1779 }
1780
1781 /**************************************************************************************************/
1782
1783 void ShhherCommand::writeGroups(){
1784     try {
1785         string thisOutputDir = outputDir;
1786         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1787         string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1788         string groupFileName = fileRoot + "shhh.groups";
1789         ofstream groupFile;
1790         m->openOutputFile(groupFileName, groupFile);
1791         
1792         for(int i=0;i<numSeqs;i++){
1793             if (m->control_pressed) { break; }
1794             groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
1795         }
1796         groupFile.close();
1797         outputNames.push_back(groupFileName);
1798         
1799     }
1800     catch(exception& e) {
1801         m->errorOut(e, "ShhherCommand", "writeGroups");
1802         exit(1);        
1803     }           
1804 }
1805
1806 /**************************************************************************************************/
1807
1808 void ShhherCommand::writeClusters(vector<int> otuCounts){
1809     try {
1810         string thisOutputDir = outputDir;
1811         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1812         string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.counts";
1813         ofstream otuCountsFile;
1814         m->openOutputFile(otuCountsFileName, otuCountsFile);
1815         
1816         string bases = flowOrder;
1817         
1818         for(int i=0;i<numOTUs;i++){
1819             
1820             if (m->control_pressed) {
1821                 break;
1822             }
1823             //output the translated version of the centroid sequence for the otu
1824             if(otuCounts[i] > 0){
1825                 int index = centroids[i];
1826                 
1827                 otuCountsFile << "ideal\t";
1828                 for(int j=8;j<numFlowCells;j++){
1829                     char base = bases[j % 4];
1830                     for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
1831                         otuCountsFile << base;
1832                     }
1833                 }
1834                 otuCountsFile << endl;
1835                 
1836                 for(int j=0;j<nSeqsPerOTU[i];j++){
1837                     int sequence = aaI[i][j];
1838                     otuCountsFile << seqNameVector[sequence] << '\t';
1839                     
1840                     string newSeq = "";
1841                     
1842                     for(int k=0;k<lengths[sequence];k++){
1843                         char base = bases[k % 4];
1844                         int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
1845                         
1846                         for(int s=0;s<freq;s++){
1847                             newSeq += base;
1848                             //otuCountsFile << base;
1849                         }
1850                     }
1851                     otuCountsFile << newSeq.substr(4) << endl;
1852                 }
1853                 otuCountsFile << endl;
1854             }
1855         }
1856         otuCountsFile.close();
1857         outputNames.push_back(otuCountsFileName);
1858         
1859     }
1860     catch(exception& e) {
1861         m->errorOut(e, "ShhherCommand", "writeClusters");
1862         exit(1);        
1863     }           
1864 }
1865
1866 #else
1867 //**********************************************************************************************************************
1868
1869 int ShhherCommand::execute(){
1870         try {
1871                 if (abort == true) { return 0; }
1872                 
1873                 getSingleLookUp();      if (m->control_pressed) { return 0; }
1874                 getJointLookUp();       if (m->control_pressed) { return 0; }
1875                 
1876         int numFiles = flowFileVector.size();
1877                 
1878         if (numFiles < processors) { processors = numFiles; }
1879         
1880 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1881         if (processors == 1) { driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size()); }
1882         else { createProcesses(flowFileVector); } //each processor processes one file
1883 #else
1884         driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size());
1885 #endif
1886         
1887                 if(compositeFASTAFileName != ""){
1888                         outputNames.push_back(compositeFASTAFileName);
1889                         outputNames.push_back(compositeNamesFileName);
1890                 }
1891
1892                 m->mothurOutEndLine();
1893                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
1894                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
1895                 m->mothurOutEndLine();
1896                 
1897                 return 0;
1898         }
1899         catch(exception& e) {
1900                 m->errorOut(e, "ShhherCommand", "execute");
1901                 exit(1);
1902         }
1903 }
1904 #endif
1905 /**************************************************************************************************/
1906
1907 int ShhherCommand::createProcesses(vector<string> filenames){
1908     try {
1909         vector<int> processIDS;
1910                 int process = 1;
1911                 int num = 0;
1912                 
1913                 //sanity check
1914                 if (filenames.size() < processors) { processors = filenames.size(); }
1915                 
1916                 //divide the groups between the processors
1917                 vector<linePair> lines;
1918                 int numFilesPerProcessor = filenames.size() / processors;
1919                 for (int i = 0; i < processors; i++) {
1920                         int startIndex =  i * numFilesPerProcessor;
1921                         int endIndex = (i+1) * numFilesPerProcessor;
1922                         if(i == (processors - 1)){      endIndex = filenames.size();    }
1923                         lines.push_back(linePair(startIndex, endIndex));
1924                 }
1925                 
1926         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)          
1927                 
1928                 //loop through and create all the processes you want
1929                 while (process != processors) {
1930                         int pid = fork();
1931                         
1932                         if (pid > 0) {
1933                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1934                                 process++;
1935                         }else if (pid == 0){
1936                                 num = driver(filenames, compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName  + toString(getpid()) + ".temp", lines[process].start, lines[process].end);
1937                                 exit(0);
1938                         }else { 
1939                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
1940                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1941                                 exit(0);
1942                         }
1943                 }
1944                 
1945                 //do my part
1946                 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[0].start, lines[0].end);
1947                 
1948                 //force parent to wait until all the processes are done
1949                 for (int i=0;i<processIDS.size();i++) { 
1950                         int temp = processIDS[i];
1951                         wait(&temp);
1952                 }
1953         
1954         #else
1955         
1956         //////////////////////////////////////////////////////////////////////////////////////////////////////
1957         
1958         /////////////////////// NOT WORKING, ACCESS VIOLATION ON READ OF FLOWGRAMS IN THREAD /////////////////
1959         
1960         //////////////////////////////////////////////////////////////////////////////////////////////////////
1961                 //Windows version shared memory, so be careful when passing variables through the shhhFlowsData struct. 
1962                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
1963                 //////////////////////////////////////////////////////////////////////////////////////////////////////
1964                 
1965                 vector<shhhFlowsData*> pDataArray; 
1966                 DWORD   dwThreadIdArray[processors-1];
1967                 HANDLE  hThreadArray[processors-1]; 
1968                 
1969                 //Create processor worker threads.
1970                 for( int i=0; i<processors-1; i++ ){
1971                         // Allocate memory for thread data.
1972                         string extension = "";
1973                         if (i != 0) { extension = toString(i) + ".temp"; }
1974                         
1975             shhhFlowsData* tempFlow = new shhhFlowsData(filenames, (compositeFASTAFileName + extension), (compositeNamesFileName + extension), outputDir, flowOrder, jointLookUp, singleLookUp, m, lines[i].start, lines[i].end, cutoff, sigma, minDelta, maxIters, i);
1976                         pDataArray.push_back(tempFlow);
1977                         processIDS.push_back(i);
1978             
1979                         hThreadArray[i] = CreateThread(NULL, 0, ShhhFlowsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
1980                 }
1981                 
1982                 //using the main process as a worker saves time and memory
1983                 //do my part
1984                 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[processors-1].start, lines[processors-1].end);
1985                 
1986                 //Wait until all threads have terminated.
1987                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1988                 
1989                 //Close all thread handles and free memory allocations.
1990                 for(int i=0; i < pDataArray.size(); i++){
1991                         for(int j=0; j < pDataArray[i]->outputNames.size(); j++){ outputNames.push_back(pDataArray[i]->outputNames[j]); }
1992                         CloseHandle(hThreadArray[i]);
1993                         delete pDataArray[i];
1994                 }
1995                 
1996         #endif
1997         
1998         for (int i=0;i<processIDS.size();i++) { 
1999             if (compositeFASTAFileName != "") {
2000                 m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
2001                 m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
2002                 m->mothurRemove((compositeFASTAFileName + toString(processIDS[i]) + ".temp"));
2003                 m->mothurRemove((compositeNamesFileName + toString(processIDS[i]) + ".temp"));
2004             }
2005         }
2006         
2007         return 0;
2008         
2009     }
2010         catch(exception& e) {
2011                 m->errorOut(e, "ShhherCommand", "createProcesses");
2012                 exit(1);
2013         }
2014 }
2015 /**************************************************************************************************/
2016
2017 int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName, int start, int end){
2018     try {
2019         
2020         for(int i=start;i<end;i++){
2021                         
2022                         if (m->control_pressed) { break; }
2023                         
2024                         string flowFileName = filenames[i];
2025             
2026                         m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
2027                         m->mothurOut("Reading flowgrams...\n");
2028                         
2029             vector<string> seqNameVector;
2030             vector<int> lengths;
2031             vector<short> flowDataIntI;
2032             vector<double> flowDataPrI;
2033             map<string, int> nameMap;
2034             vector<short> uniqueFlowgrams;
2035             vector<int> uniqueCount;
2036             vector<int> mapSeqToUnique;
2037             vector<int> mapUniqueToSeq;
2038             vector<int> uniqueLengths;
2039             int numFlowCells;
2040             
2041             int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
2042                         
2043                         if (m->control_pressed) { break; }
2044                         
2045                         m->mothurOut("Identifying unique flowgrams...\n");
2046                         int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
2047                         
2048                         if (m->control_pressed) { break; }
2049                         
2050                         m->mothurOut("Calculating distances between flowgrams...\n");
2051             string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
2052             unsigned long long begTime = time(NULL);
2053             double begClock = clock();
2054         
2055             flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);     
2056             
2057             m->mothurOutEndLine();
2058             m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
2059
2060             
2061                         string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
2062                         createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
2063                         
2064                         if (m->control_pressed) { break; }
2065                         
2066                         m->mothurOut("\nClustering flowgrams...\n");
2067             string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
2068                         cluster(listFileName, distFileName, namesFileName);
2069                         
2070                         if (m->control_pressed) { break; }
2071             
2072             vector<int> otuData;
2073             vector<int> cumNumSeqs;
2074             vector<int> nSeqsPerOTU;
2075             vector<vector<int> > aaP;   //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2076             vector<vector<int> > aaI;   //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2077             vector<int> seqNumber;              //tMaster->anP:         the sequence id number sorted by OTU
2078             vector<int> seqIndex;               //tMaster->anI;         the index that corresponds to seqNumber
2079
2080                         
2081                         int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
2082                         
2083                         if (m->control_pressed) { break; }
2084                         
2085                         m->mothurRemove(distFileName);
2086                         m->mothurRemove(namesFileName);
2087                         m->mothurRemove(listFileName);
2088                         
2089             vector<double> dist;                //adDist - distance of sequences to centroids
2090             vector<short> change;               //did the centroid sequence change? 0 = no; 1 = yes
2091             vector<int> centroids;              //the representative flowgram for each cluster m
2092             vector<double> weight;
2093             vector<double> singleTau;   //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2094             vector<int> nSeqsBreaks;
2095             vector<int> nOTUsBreaks;
2096             
2097                         dist.assign(numSeqs * numOTUs, 0);
2098             change.assign(numOTUs, 1);
2099             centroids.assign(numOTUs, -1);
2100             weight.assign(numOTUs, 0);
2101             singleTau.assign(numSeqs, 1.0);
2102             
2103             nSeqsBreaks.assign(2, 0);
2104             nOTUsBreaks.assign(2, 0);
2105             
2106             nSeqsBreaks[0] = 0;
2107             nSeqsBreaks[1] = numSeqs;
2108             nOTUsBreaks[1] = numOTUs;
2109                         
2110                         if (m->control_pressed) { break; }
2111                         
2112                         double maxDelta = 0;
2113                         int iter = 0;
2114                         
2115                         begClock = clock();
2116                         begTime = time(NULL);
2117             
2118                         m->mothurOut("\nDenoising flowgrams...\n");
2119                         m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
2120                         
2121                         while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
2122                                 
2123                                 if (m->control_pressed) { break; }
2124                                 
2125                                 double cycClock = clock();
2126                                 unsigned long long cycTime = time(NULL);
2127                                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2128                                 
2129                                 if (m->control_pressed) { break; }
2130                 
2131                                 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2132                                 
2133                                 if (m->control_pressed) { break; }
2134                 
2135                                 maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);  
2136                 
2137                 if (m->control_pressed) { break; }
2138                 
2139                                 double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight); 
2140                 
2141                 if (m->control_pressed) { break; }
2142                 
2143                                 checkCentroids(numOTUs, centroids, weight);
2144                                 
2145                                 if (m->control_pressed) { break; }
2146                                 
2147                                 calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU,  dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
2148                                 
2149                                 if (m->control_pressed) { break; }
2150                                 
2151                                 iter++;
2152                                 
2153                                 m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
2154                 
2155                         }       
2156                         
2157                         if (m->control_pressed) { break; }
2158                         
2159                         m->mothurOut("\nFinalizing...\n");
2160                         fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2161                         
2162                         if (m->control_pressed) { break; }
2163                         
2164                         setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
2165                         
2166                         if (m->control_pressed) { break; }
2167                         
2168                         vector<int> otuCounts(numOTUs, 0);
2169                         for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
2170                         
2171                         calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber); 
2172             
2173             if (m->control_pressed) { break; }
2174             
2175                         writeQualities(numOTUs, numFlowCells, flowFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
2176                         writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, flowFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
2177                         writeNames(thisCompositeNamesFileName, numOTUs, flowFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU);                              if (m->control_pressed) { break; }
2178                         writeClusters(flowFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI);                       if (m->control_pressed) { break; }
2179                         writeGroups(flowFileName, numSeqs, seqNameVector);                                              if (m->control_pressed) { break; }
2180                         
2181                         m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
2182                 }
2183                 
2184         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
2185         
2186         return 0;
2187         
2188     }catch(exception& e) {
2189             m->errorOut(e, "ShhherCommand", "driver");
2190             exit(1);
2191     }
2192 }
2193
2194 /**************************************************************************************************/
2195 int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVector, vector<int>& thisLengths, vector<short>& thisFlowDataIntI, map<string, int>& thisNameMap, int& numFlowCells){
2196         try{
2197        
2198                 ifstream flowFile;
2199        
2200                 m->openInputFile(filename, flowFile);
2201                 
2202                 string seqName;
2203                 int currentNumFlowCells;
2204                 float intensity;
2205         thisSeqNameVector.clear();
2206                 thisLengths.clear();
2207                 thisFlowDataIntI.clear();
2208                 thisNameMap.clear();
2209                 
2210                 flowFile >> numFlowCells;
2211                 int index = 0;//pcluster
2212                 while(!flowFile.eof()){
2213                         
2214                         if (m->control_pressed) { break; }
2215                         
2216                         flowFile >> seqName >> currentNumFlowCells;
2217                         thisLengths.push_back(currentNumFlowCells);
2218            
2219                         thisSeqNameVector.push_back(seqName);
2220                         thisNameMap[seqName] = index++;//pcluster
2221
2222                         for(int i=0;i<numFlowCells;i++){
2223                                 flowFile >> intensity;
2224                                 if(intensity > 9.99)    {       intensity = 9.99;       }
2225                                 int intI = int(100 * intensity + 0.0001);
2226                                 thisFlowDataIntI.push_back(intI);
2227                         }
2228                         m->gobble(flowFile);
2229                 }
2230                 flowFile.close();
2231                 
2232                 int numSeqs = thisSeqNameVector.size();         
2233                 
2234                 for(int i=0;i<numSeqs;i++){
2235                         
2236                         if (m->control_pressed) { break; }
2237                         
2238                         int iNumFlowCells = i * numFlowCells;
2239                         for(int j=thisLengths[i];j<numFlowCells;j++){
2240                                 thisFlowDataIntI[iNumFlowCells + j] = 0;
2241                         }
2242                 }
2243         
2244         return numSeqs;
2245                 
2246         }
2247         catch(exception& e) {
2248                 m->errorOut(e, "ShhherCommand", "getFlowData");
2249                 exit(1);
2250         }
2251 }
2252 /**************************************************************************************************/
2253
2254 int ShhherCommand::flowDistParentFork(int numFlowCells, string distFileName, int stopSeq, vector<int>& mapUniqueToSeq, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2255         try{            
2256         
2257                 ostringstream outStream;
2258                 outStream.setf(ios::fixed, ios::floatfield);
2259                 outStream.setf(ios::dec, ios::basefield);
2260                 outStream.setf(ios::showpoint);
2261                 outStream.precision(6);
2262                 
2263                 int begTime = time(NULL);
2264                 double begClock = clock();
2265         
2266                 for(int i=0;i<stopSeq;i++){
2267                         
2268                         if (m->control_pressed) { break; }
2269                         
2270                         for(int j=0;j<i;j++){
2271                                 float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2272                 
2273                                 if(flowDistance < 1e-6){
2274                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
2275                                 }
2276                                 else if(flowDistance <= cutoff){
2277                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
2278                                 }
2279                         }
2280                         if(i % 100 == 0){
2281                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
2282                                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2283                                 m->mothurOutEndLine();
2284                         }
2285                 }
2286                 
2287                 ofstream distFile(distFileName.c_str());
2288                 distFile << outStream.str();            
2289                 distFile.close();
2290                 
2291                 if (m->control_pressed) {}
2292                 else {
2293                         m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
2294                         m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2295                         m->mothurOutEndLine();
2296                 }
2297         
2298         return 0;
2299         }
2300         catch(exception& e) {
2301                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
2302                 exit(1);
2303         }
2304 }
2305 /**************************************************************************************************/
2306
2307 float ShhherCommand::calcPairwiseDist(int numFlowCells, int seqA, int seqB, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2308         try{
2309                 int minLength = lengths[mapSeqToUnique[seqA]];
2310                 if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
2311                 
2312                 int ANumFlowCells = seqA * numFlowCells;
2313                 int BNumFlowCells = seqB * numFlowCells;
2314                 
2315                 float dist = 0;
2316                 
2317                 for(int i=0;i<minLength;i++){
2318                         
2319                         if (m->control_pressed) { break; }
2320                         
2321                         int flowAIntI = flowDataIntI[ANumFlowCells + i];
2322                         float flowAPrI = flowDataPrI[ANumFlowCells + i];
2323                         
2324                         int flowBIntI = flowDataIntI[BNumFlowCells + i];
2325                         float flowBPrI = flowDataPrI[BNumFlowCells + i];
2326                         dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
2327                 }
2328                 
2329                 dist /= (float) minLength;
2330                 return dist;
2331         }
2332         catch(exception& e) {
2333                 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
2334                 exit(1);
2335         }
2336 }
2337
2338 /**************************************************************************************************/
2339
2340 int ShhherCommand::getUniques(int numSeqs, int numFlowCells, vector<short>& uniqueFlowgrams, vector<int>& uniqueCount, vector<int>& uniqueLengths, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2341         try{
2342                 int numUniques = 0;
2343                 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
2344                 uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
2345                 uniqueLengths.assign(numSeqs, 0);
2346                 mapSeqToUnique.assign(numSeqs, -1);
2347                 mapUniqueToSeq.assign(numSeqs, -1);
2348                 
2349                 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
2350                 
2351                 for(int i=0;i<numSeqs;i++){
2352                         
2353                         if (m->control_pressed) { break; }
2354                         
2355                         int index = 0;
2356                         
2357                         vector<short> current(numFlowCells);
2358                         for(int j=0;j<numFlowCells;j++){
2359                                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
2360                         }
2361             
2362                         for(int j=0;j<numUniques;j++){
2363                                 int offset = j * numFlowCells;
2364                                 bool toEnd = 1;
2365                                 
2366                                 int shorterLength;
2367                                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
2368                                 else                                                            {       shorterLength = uniqueLengths[j];       }
2369                 
2370                                 for(int k=0;k<shorterLength;k++){
2371                                         if(current[k] != uniqueFlowgrams[offset + k]){
2372                                                 toEnd = 0;
2373                                                 break;
2374                                         }
2375                                 }
2376                                 
2377                                 if(toEnd){
2378                                         mapSeqToUnique[i] = j;
2379                                         uniqueCount[j]++;
2380                                         index = j;
2381                                         if(lengths[i] > uniqueLengths[j])       {       uniqueLengths[j] = lengths[i];  }
2382                                         break;
2383                                 }
2384                                 index++;
2385                         }
2386                         
2387                         if(index == numUniques){
2388                                 uniqueLengths[numUniques] = lengths[i];
2389                                 uniqueCount[numUniques] = 1;
2390                                 mapSeqToUnique[i] = numUniques;//anMap
2391                                 mapUniqueToSeq[numUniques] = i;//anF
2392                                 
2393                                 for(int k=0;k<numFlowCells;k++){
2394                                         uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
2395                                         uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
2396                                 }
2397                                 
2398                                 numUniques++;
2399                         }
2400                 }
2401                 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
2402                 uniqueLengths.resize(numUniques);       
2403                 
2404                 flowDataPrI.resize(numSeqs * numFlowCells, 0);
2405                 for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
2406         
2407         return numUniques;
2408         }
2409         catch(exception& e) {
2410                 m->errorOut(e, "ShhherCommand", "getUniques");
2411                 exit(1);
2412         }
2413 }
2414 /**************************************************************************************************/
2415 int ShhherCommand::createNamesFile(int numSeqs, int numUniques, string filename, vector<string>& seqNameVector, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq){
2416         try{
2417                 
2418                 vector<string> duplicateNames(numUniques, "");
2419                 for(int i=0;i<numSeqs;i++){
2420                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
2421                 }
2422                 
2423                 ofstream nameFile;
2424                 m->openOutputFile(filename, nameFile);
2425                 
2426                 for(int i=0;i<numUniques;i++){
2427                         
2428                         if (m->control_pressed) { break; }
2429                         
2430             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2431                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2432                 }
2433                 
2434                 nameFile.close();
2435         
2436                 return 0;
2437         }
2438         catch(exception& e) {
2439                 m->errorOut(e, "ShhherCommand", "createNamesFile");
2440                 exit(1);
2441         }
2442 }
2443 //**********************************************************************************************************************
2444
2445 int ShhherCommand::cluster(string filename, string distFileName, string namesFileName){
2446         try {
2447                 
2448                 ReadMatrix* read = new ReadColumnMatrix(distFileName);  
2449                 read->setCutoff(cutoff);
2450                 
2451                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
2452                 clusterNameMap->readMap();
2453                 read->read(clusterNameMap);
2454         
2455                 ListVector* list = read->getListVector();
2456                 SparseMatrix* matrix = read->getMatrix();
2457                 
2458                 delete read; 
2459                 delete clusterNameMap; 
2460         
2461                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
2462                 
2463                 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
2464                 string tag = cluster->getTag();
2465                 
2466                 double clusterCutoff = cutoff;
2467                 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
2468                         
2469                         if (m->control_pressed) { break; }
2470                         
2471                         cluster->update(clusterCutoff);
2472                 }
2473                 
2474                 list->setLabel(toString(cutoff));
2475                 
2476                 ofstream listFile;
2477                 m->openOutputFile(filename, listFile);
2478                 list->print(listFile);
2479                 listFile.close();
2480                 
2481                 delete matrix;  delete cluster; delete rabund; delete list;
2482         
2483                 return 0;
2484         }
2485         catch(exception& e) {
2486                 m->errorOut(e, "ShhherCommand", "cluster");
2487                 exit(1);        
2488         }               
2489 }
2490 /**************************************************************************************************/
2491
2492 int ShhherCommand::getOTUData(int numSeqs, string fileName,  vector<int>& otuData,
2493                                vector<int>& cumNumSeqs,
2494                                vector<int>& nSeqsPerOTU,
2495                                vector<vector<int> >& aaP,       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2496                                vector<vector<int> >& aaI,       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2497                                vector<int>& seqNumber,          //tMaster->anP:         the sequence id number sorted by OTU
2498                                vector<int>& seqIndex,
2499                                map<string, int>& nameMap){
2500         try {
2501         
2502                 ifstream listFile;
2503                 m->openInputFile(fileName, listFile);
2504                 string label;
2505         int numOTUs;
2506                 
2507                 listFile >> label >> numOTUs;
2508         
2509                 otuData.assign(numSeqs, 0);
2510                 cumNumSeqs.assign(numOTUs, 0);
2511                 nSeqsPerOTU.assign(numOTUs, 0);
2512                 aaP.clear();aaP.resize(numOTUs);
2513                 
2514                 seqNumber.clear();
2515                 aaI.clear();
2516                 seqIndex.clear();
2517                 
2518                 string singleOTU = "";
2519                 
2520                 for(int i=0;i<numOTUs;i++){
2521                         
2522                         if (m->control_pressed) { break; }
2523             
2524                         listFile >> singleOTU;
2525                         
2526                         istringstream otuString(singleOTU);
2527             
2528                         while(otuString){
2529                                 
2530                                 string seqName = "";
2531                                 
2532                                 for(int j=0;j<singleOTU.length();j++){
2533                                         char letter = otuString.get();
2534                                         
2535                                         if(letter != ','){
2536                                                 seqName += letter;
2537                                         }
2538                                         else{
2539                                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2540                                                 int index = nmIt->second;
2541                                                 
2542                                                 nameMap.erase(nmIt);
2543                                                 
2544                                                 otuData[index] = i;
2545                                                 nSeqsPerOTU[i]++;
2546                                                 aaP[i].push_back(index);
2547                                                 seqName = "";
2548                                         }
2549                                 }
2550                                 
2551                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2552                 
2553                                 int index = nmIt->second;
2554                                 nameMap.erase(nmIt);
2555                 
2556                                 otuData[index] = i;
2557                                 nSeqsPerOTU[i]++;
2558                                 aaP[i].push_back(index);        
2559                                 
2560                                 otuString.get();
2561                         }
2562                         
2563                         sort(aaP[i].begin(), aaP[i].end());
2564                         for(int j=0;j<nSeqsPerOTU[i];j++){
2565                                 seqNumber.push_back(aaP[i][j]);
2566                         }
2567                         for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
2568                                 aaP[i].push_back(0);
2569                         }
2570                         
2571                         
2572                 }
2573                 
2574                 for(int i=1;i<numOTUs;i++){
2575                         cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
2576                 }
2577                 aaI = aaP;
2578                 seqIndex = seqNumber;
2579                 
2580                 listFile.close();       
2581         
2582         return numOTUs;
2583                 
2584         }
2585         catch(exception& e) {
2586                 m->errorOut(e, "ShhherCommand", "getOTUData");
2587                 exit(1);        
2588         }               
2589 }
2590 /**************************************************************************************************/
2591
2592 int ShhherCommand::calcCentroidsDriver(int numOTUs, 
2593                                           vector<int>& cumNumSeqs,
2594                                           vector<int>& nSeqsPerOTU,
2595                                           vector<int>& seqIndex,
2596                                           vector<short>& change,                //did the centroid sequence change? 0 = no; 1 = yes
2597                                           vector<int>& centroids,               //the representative flowgram for each cluster m
2598                                           vector<double>& singleTau,    //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2599                                           vector<int>& mapSeqToUnique,
2600                                           vector<short>& uniqueFlowgrams,
2601                                           vector<short>& flowDataIntI,
2602                                           vector<int>& lengths,
2603                                           int numFlowCells,
2604                                           vector<int>& seqNumber){                          
2605         
2606         //this function gets the most likely homopolymer length at a flow position for a group of sequences
2607         //within an otu
2608         
2609         try{
2610                 
2611                 for(int i=0;i<numOTUs;i++){
2612                         
2613                         if (m->control_pressed) { break; }
2614                         
2615                         double count = 0;
2616                         int position = 0;
2617                         int minFlowGram = 100000000;
2618                         double minFlowValue = 1e8;
2619                         change[i] = 0; //FALSE
2620                         
2621                         for(int j=0;j<nSeqsPerOTU[i];j++){
2622                                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
2623                         }
2624             
2625                         if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
2626                                 vector<double> adF(nSeqsPerOTU[i]);
2627                                 vector<int> anL(nSeqsPerOTU[i]);
2628                                 
2629                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2630                                         int index = cumNumSeqs[i] + j;
2631                                         int nI = seqIndex[index];
2632                                         int nIU = mapSeqToUnique[nI];
2633                                         
2634                                         int k;
2635                                         for(k=0;k<position;k++){
2636                                                 if(nIU == anL[k]){
2637                                                         break;
2638                                                 }
2639                                         }
2640                                         if(k == position){
2641                                                 anL[position] = nIU;
2642                                                 adF[position] = 0.0000;
2643                                                 position++;
2644                                         }                                               
2645                                 }
2646                                 
2647                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2648                                         int index = cumNumSeqs[i] + j;
2649                                         int nI = seqIndex[index];
2650                                         
2651                                         double tauValue = singleTau[seqNumber[index]];
2652                                         
2653                                         for(int k=0;k<position;k++){
2654                                                 double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
2655                                                 adF[k] += dist * tauValue;
2656                                         }
2657                                 }
2658                                 
2659                                 for(int j=0;j<position;j++){
2660                                         if(adF[j] < minFlowValue){
2661                                                 minFlowGram = j;
2662                                                 minFlowValue = adF[j];
2663                                         }
2664                                 }
2665                                 
2666                                 if(centroids[i] != anL[minFlowGram]){
2667                                         change[i] = 1;
2668                                         centroids[i] = anL[minFlowGram];
2669                                 }
2670                         }
2671                         else if(centroids[i] != -1){
2672                                 change[i] = 1;
2673                                 centroids[i] = -1;                      
2674                         }
2675                 }
2676         
2677         return 0;
2678         }
2679         catch(exception& e) {
2680                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
2681                 exit(1);        
2682         }               
2683 }
2684 /**************************************************************************************************/
2685
2686 double ShhherCommand::getDistToCentroid(int cent, int flow, int length, vector<short>& uniqueFlowgrams,
2687                                         vector<short>& flowDataIntI, int numFlowCells){
2688         try{
2689                 
2690                 int flowAValue = cent * numFlowCells;
2691                 int flowBValue = flow * numFlowCells;
2692                 
2693                 double dist = 0;
2694         
2695                 for(int i=0;i<length;i++){
2696                         dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
2697                         flowAValue++;
2698                         flowBValue++;
2699                 }
2700                 
2701                 return dist / (double)length;
2702         }
2703         catch(exception& e) {
2704                 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
2705                 exit(1);        
2706         }               
2707 }
2708 /**************************************************************************************************/
2709
2710 double ShhherCommand::getNewWeights(int numOTUs, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<double>& singleTau, vector<int>& seqNumber, vector<double>& weight){
2711         try{
2712                 
2713                 double maxChange = 0;
2714                 
2715                 for(int i=0;i<numOTUs;i++){
2716                         
2717                         if (m->control_pressed) { break; }
2718                         
2719                         double difference = weight[i];
2720                         weight[i] = 0;
2721                         
2722                         for(int j=0;j<nSeqsPerOTU[i];j++){
2723                                 int index = cumNumSeqs[i] + j;
2724                                 double tauValue = singleTau[seqNumber[index]];
2725                                 weight[i] += tauValue;
2726                         }
2727                         
2728                         difference = fabs(weight[i] - difference);
2729                         if(difference > maxChange){     maxChange = difference; }
2730                 }
2731                 return maxChange;
2732         }
2733         catch(exception& e) {
2734                 m->errorOut(e, "ShhherCommand", "getNewWeights");
2735                 exit(1);        
2736         }               
2737 }
2738
2739 /**************************************************************************************************/
2740
2741 double ShhherCommand::getLikelihood(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<int>& seqNumber, vector<int>& cumNumSeqs, vector<int>& seqIndex, vector<double>& dist, vector<double>& weight){
2742         
2743         try{
2744                 
2745                 vector<long double> P(numSeqs, 0);
2746                 int effNumOTUs = 0;
2747                 
2748                 for(int i=0;i<numOTUs;i++){
2749                         if(weight[i] > MIN_WEIGHT){
2750                                 effNumOTUs++;
2751                         }
2752                 }
2753                 
2754                 string hold;
2755                 for(int i=0;i<numOTUs;i++){
2756                         
2757                         if (m->control_pressed) { break; }
2758                         
2759                         for(int j=0;j<nSeqsPerOTU[i];j++){
2760                                 int index = cumNumSeqs[i] + j;
2761                                 int nI = seqIndex[index];
2762                                 double singleDist = dist[seqNumber[index]];
2763                                 
2764                                 P[nI] += weight[i] * exp(-singleDist * sigma);
2765                         }
2766                 }
2767                 double nLL = 0.00;
2768                 for(int i=0;i<numSeqs;i++){
2769                         if(P[i] == 0){  P[i] = DBL_EPSILON;     }
2770             
2771                         nLL += -log(P[i]);
2772                 }
2773                 
2774                 nLL = nLL -(double)numSeqs * log(sigma);
2775         
2776                 return nLL; 
2777         }
2778         catch(exception& e) {
2779                 m->errorOut(e, "ShhherCommand", "getNewWeights");
2780                 exit(1);        
2781         }               
2782 }
2783
2784 /**************************************************************************************************/
2785
2786 int ShhherCommand::checkCentroids(int numOTUs, vector<int>& centroids, vector<double>& weight){
2787         try{
2788                 vector<int> unique(numOTUs, 1);
2789                 
2790                 for(int i=0;i<numOTUs;i++){
2791                         if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
2792                                 unique[i] = -1;
2793                         }
2794                 }
2795                 
2796                 for(int i=0;i<numOTUs;i++){
2797                         
2798                         if (m->control_pressed) { break; }
2799                         
2800                         if(unique[i] == 1){
2801                                 for(int j=i+1;j<numOTUs;j++){
2802                                         if(unique[j] == 1){
2803                                                 
2804                                                 if(centroids[j] == centroids[i]){
2805                                                         unique[j] = 0;
2806                                                         centroids[j] = -1;
2807                                                         
2808                                                         weight[i] += weight[j];
2809                                                         weight[j] = 0.0;
2810                                                 }
2811                                         }
2812                                 }
2813                         }
2814                 }
2815         
2816         return 0;
2817         }
2818         catch(exception& e) {
2819                 m->errorOut(e, "ShhherCommand", "checkCentroids");
2820                 exit(1);        
2821         }               
2822 }
2823 /**************************************************************************************************/
2824
2825 void ShhherCommand::calcNewDistances(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<double>& dist, 
2826                                      vector<double>& weight, vector<short>& change, vector<int>& centroids,
2827                                      vector<vector<int> >& aaP, vector<double>& singleTau, vector<vector<int> >& aaI,   
2828                                      vector<int>& seqNumber, vector<int>& seqIndex,
2829                                      vector<short>& uniqueFlowgrams,
2830                                      vector<short>& flowDataIntI, int numFlowCells, vector<int>& lengths){
2831         
2832         try{
2833                 
2834                 int total = 0;
2835                 vector<double> newTau(numOTUs,0);
2836                 vector<double> norms(numSeqs, 0);
2837                 nSeqsPerOTU.assign(numOTUs, 0);
2838         
2839                 for(int i=0;i<numSeqs;i++){
2840                         
2841                         if (m->control_pressed) { break; }
2842                         
2843                         int indexOffset = i * numOTUs;
2844             
2845                         double offset = 1e8;
2846                         
2847                         for(int j=0;j<numOTUs;j++){
2848                 
2849                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
2850                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
2851                                 }
2852                 
2853                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
2854                                         offset = dist[indexOffset + j];
2855                                 }
2856                         }
2857             
2858                         for(int j=0;j<numOTUs;j++){
2859                                 if(weight[j] > MIN_WEIGHT){
2860                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
2861                                         norms[i] += newTau[j];
2862                                 }
2863                                 else{
2864                                         newTau[j] = 0.0;
2865                                 }
2866                         }
2867             
2868                         for(int j=0;j<numOTUs;j++){
2869                                 newTau[j] /= norms[i];
2870                         }
2871             
2872                         for(int j=0;j<numOTUs;j++){
2873                                 if(newTau[j] > MIN_TAU){
2874                                         
2875                                         int oldTotal = total;
2876                                         
2877                                         total++;
2878                                         
2879                                         singleTau.resize(total, 0);
2880                                         seqNumber.resize(total, 0);
2881                                         seqIndex.resize(total, 0);
2882                                         
2883                                         singleTau[oldTotal] = newTau[j];
2884                                         
2885                                         aaP[j][nSeqsPerOTU[j]] = oldTotal;
2886                                         aaI[j][nSeqsPerOTU[j]] = i;
2887                                         nSeqsPerOTU[j]++;
2888                                 }
2889                         }
2890             
2891                 }
2892         
2893         }
2894         catch(exception& e) {
2895                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
2896                 exit(1);        
2897         }               
2898 }
2899 /**************************************************************************************************/
2900
2901 int ShhherCommand::fill(int numOTUs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
2902         try {
2903                 int index = 0;
2904                 for(int i=0;i<numOTUs;i++){
2905                         
2906                         if (m->control_pressed) { return 0; }
2907                         
2908                         cumNumSeqs[i] = index;
2909                         for(int j=0;j<nSeqsPerOTU[i];j++){
2910                                 seqNumber[index] = aaP[i][j];
2911                                 seqIndex[index] = aaI[i][j];
2912                                 
2913                                 index++;
2914                         }
2915                 }
2916         
2917         return 0; 
2918         }
2919         catch(exception& e) {
2920                 m->errorOut(e, "ShhherCommand", "fill");
2921                 exit(1);        
2922         }               
2923 }
2924 /**************************************************************************************************/
2925
2926 void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU,
2927                             vector<int>& otuData, vector<double>& singleTau, vector<double>& dist, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
2928         
2929         try {
2930                 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
2931                 
2932                 for(int i=0;i<numOTUs;i++){
2933                         
2934                         if (m->control_pressed) { break; }
2935                         
2936                         for(int j=0;j<nSeqsPerOTU[i];j++){
2937                                 int index = cumNumSeqs[i] + j;
2938                                 double tauValue = singleTau[seqNumber[index]];
2939                                 int sIndex = seqIndex[index];
2940                                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
2941                         }
2942                 }
2943                 
2944                 for(int i=0;i<numSeqs;i++){
2945                         double maxTau = -1.0000;
2946                         int maxOTU = -1;
2947                         for(int j=0;j<numOTUs;j++){
2948                                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
2949                                         maxTau = bigTauMatrix[i * numOTUs + j];
2950                                         maxOTU = j;
2951                                 }
2952                         }
2953                         
2954                         otuData[i] = maxOTU;
2955                 }
2956                 
2957                 nSeqsPerOTU.assign(numOTUs, 0);         
2958                 
2959                 for(int i=0;i<numSeqs;i++){
2960                         int index = otuData[i];
2961                         
2962                         singleTau[i] = 1.0000;
2963                         dist[i] = 0.0000;
2964                         
2965                         aaP[index][nSeqsPerOTU[index]] = i;
2966                         aaI[index][nSeqsPerOTU[index]] = i;
2967                         
2968                         nSeqsPerOTU[index]++;
2969                 }
2970         
2971                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);  
2972         }
2973         catch(exception& e) {
2974                 m->errorOut(e, "ShhherCommand", "setOTUs");
2975                 exit(1);        
2976         }               
2977 }
2978 /**************************************************************************************************/
2979
2980 void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string filename, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
2981                                    vector<double>& singleTau, vector<short>& flowDataIntI, vector<short>& uniqueFlowgrams, vector<int>& cumNumSeqs,
2982                                    vector<int>& mapUniqueToSeq, vector<string>& seqNameVector, vector<int>& centroids, vector<vector<int> >& aaI){
2983         
2984         try {
2985                 string thisOutputDir = outputDir;
2986                 if (outputDir == "") {  thisOutputDir = m->hasPath(filename);  }
2987                 string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.qual";
2988         
2989                 ofstream qualityFile;
2990                 m->openOutputFile(qualityFileName, qualityFile);
2991         
2992                 qualityFile.setf(ios::fixed, ios::floatfield);
2993                 qualityFile.setf(ios::showpoint);
2994                 qualityFile << setprecision(6);
2995                 
2996                 vector<vector<int> > qualities(numOTUs);
2997                 vector<double> pr(HOMOPS, 0);
2998                 
2999                 
3000                 for(int i=0;i<numOTUs;i++){
3001                         
3002                         if (m->control_pressed) { break; }
3003                         
3004                         int index = 0;
3005                         int base = 0;
3006                         
3007                         if(nSeqsPerOTU[i] > 0){
3008                                 qualities[i].assign(1024, -1);
3009                                 
3010                                 while(index < numFlowCells){
3011                                         double maxPrValue = 1e8;
3012                                         short maxPrIndex = -1;
3013                                         double count = 0.0000;
3014                                         
3015                                         pr.assign(HOMOPS, 0);
3016                                         
3017                                         for(int j=0;j<nSeqsPerOTU[i];j++){
3018                                                 int lIndex = cumNumSeqs[i] + j;
3019                                                 double tauValue = singleTau[seqNumber[lIndex]];
3020                                                 int sequenceIndex = aaI[i][j];
3021                                                 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
3022                                                 
3023                                                 count += tauValue;
3024                                                 
3025                                                 for(int s=0;s<HOMOPS;s++){
3026                                                         pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
3027                                                 }
3028                                         }
3029                                         
3030                                         maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
3031                                         maxPrValue = pr[maxPrIndex];
3032                                         
3033                                         if(count > MIN_COUNT){
3034                                                 double U = 0.0000;
3035                                                 double norm = 0.0000;
3036                                                 
3037                                                 for(int s=0;s<HOMOPS;s++){
3038                                                         norm += exp(-(pr[s] - maxPrValue));
3039                                                 }
3040                                                 
3041                                                 for(int s=1;s<=maxPrIndex;s++){
3042                                                         int value = 0;
3043                                                         double temp = 0.0000;
3044                                                         
3045                                                         U += exp(-(pr[s-1]-maxPrValue))/norm;
3046                                                         
3047                                                         if(U>0.00){
3048                                                                 temp = log10(U);
3049                                                         }
3050                                                         else{
3051                                                                 temp = -10.1;
3052                                                         }
3053                                                         temp = floor(-10 * temp);
3054                                                         value = (int)floor(temp);
3055                                                         if(value > 100){        value = 100;    }
3056                                                         
3057                                                         qualities[i][base] = (int)value;
3058                                                         base++;
3059                                                 }
3060                                         }
3061                                         
3062                                         index++;
3063                                 }
3064                         }
3065                         
3066                         
3067                         if(otuCounts[i] > 0){
3068                                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
3069                         
3070                                 int j=4;        //need to get past the first four bases
3071                                 while(qualities[i][j] != -1){
3072                     qualityFile << qualities[i][j] << ' ';
3073                     if (j > qualities[i].size()) { break; }
3074                     j++;
3075                                 }
3076                                 qualityFile << endl;
3077                         }
3078                 }
3079                 qualityFile.close();
3080                 outputNames.push_back(qualityFileName);
3081         
3082         }
3083         catch(exception& e) {
3084                 m->errorOut(e, "ShhherCommand", "writeQualities");
3085                 exit(1);        
3086         }               
3087 }
3088
3089 /**************************************************************************************************/
3090
3091 void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string filename, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
3092         try {
3093                 string thisOutputDir = outputDir;
3094                 if (outputDir == "") {  thisOutputDir = m->hasPath(filename);  }
3095                 string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.fasta";
3096                 ofstream fastaFile;
3097                 m->openOutputFile(fastaFileName, fastaFile);
3098                 
3099                 vector<string> names(numOTUs, "");
3100                 
3101                 for(int i=0;i<numOTUs;i++){
3102                         
3103                         if (m->control_pressed) { break; }
3104                         
3105                         int index = centroids[i];
3106                         
3107                         if(otuCounts[i] > 0){
3108                                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
3109                                 
3110                                 string newSeq = "";
3111                                 
3112                                 for(int j=0;j<numFlowCells;j++){
3113                                         
3114                                         char base = flowOrder[j % 4];
3115                                         for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
3116                                                 newSeq += base;
3117                                         }
3118                                 }
3119                                 
3120                                 fastaFile << newSeq.substr(4) << endl;
3121                         }
3122                 }
3123                 fastaFile.close();
3124         
3125                 outputNames.push_back(fastaFileName);
3126         
3127                 if(thisCompositeFASTAFileName != ""){
3128                         m->appendFiles(fastaFileName, thisCompositeFASTAFileName);
3129                 }
3130         }
3131         catch(exception& e) {
3132                 m->errorOut(e, "ShhherCommand", "writeSequences");
3133                 exit(1);        
3134         }               
3135 }
3136
3137 /**************************************************************************************************/
3138
3139 void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string filename, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
3140         try {
3141                 string thisOutputDir = outputDir;
3142                 if (outputDir == "") {  thisOutputDir = m->hasPath(filename);  }
3143                 string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.names";
3144                 ofstream nameFile;
3145                 m->openOutputFile(nameFileName, nameFile);
3146                 
3147                 for(int i=0;i<numOTUs;i++){
3148                         
3149                         if (m->control_pressed) { break; }
3150                         
3151                         if(otuCounts[i] > 0){
3152                                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
3153                                 
3154                                 for(int j=1;j<nSeqsPerOTU[i];j++){
3155                                         nameFile << ',' << seqNameVector[aaI[i][j]];
3156                                 }
3157                                 
3158                                 nameFile << endl;
3159                         }
3160                 }
3161                 nameFile.close();
3162                 outputNames.push_back(nameFileName);
3163                 
3164                 
3165                 if(thisCompositeNamesFileName != ""){
3166                         m->appendFiles(nameFileName, thisCompositeNamesFileName);
3167                 }               
3168         }
3169         catch(exception& e) {
3170                 m->errorOut(e, "ShhherCommand", "writeNames");
3171                 exit(1);        
3172         }               
3173 }
3174
3175 /**************************************************************************************************/
3176
3177 void ShhherCommand::writeGroups(string filename, int numSeqs, vector<string>& seqNameVector){
3178         try {
3179                 string thisOutputDir = outputDir;
3180                 if (outputDir == "") {  thisOutputDir = m->hasPath(filename);  }
3181                 string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(filename));
3182                 string groupFileName = fileRoot + "shhh.groups";
3183                 ofstream groupFile;
3184                 m->openOutputFile(groupFileName, groupFile);
3185                 
3186                 for(int i=0;i<numSeqs;i++){
3187                         if (m->control_pressed) { break; }
3188                         groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
3189                 }
3190                 groupFile.close();
3191                 outputNames.push_back(groupFileName);
3192         
3193         }
3194         catch(exception& e) {
3195                 m->errorOut(e, "ShhherCommand", "writeGroups");
3196                 exit(1);        
3197         }               
3198 }
3199
3200 /**************************************************************************************************/
3201
3202 void ShhherCommand::writeClusters(string filename, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
3203         try {
3204                 string thisOutputDir = outputDir;
3205                 if (outputDir == "") {  thisOutputDir = m->hasPath(filename);  }
3206                 string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.counts";
3207                 ofstream otuCountsFile;
3208                 m->openOutputFile(otuCountsFileName, otuCountsFile);
3209                 
3210                 string bases = flowOrder;
3211                 
3212                 for(int i=0;i<numOTUs;i++){
3213                         
3214                         if (m->control_pressed) {
3215                                 break;
3216                         }
3217                         //output the translated version of the centroid sequence for the otu
3218                         if(otuCounts[i] > 0){
3219                                 int index = centroids[i];
3220                                 
3221                                 otuCountsFile << "ideal\t";
3222                                 for(int j=8;j<numFlowCells;j++){
3223                                         char base = bases[j % 4];
3224                                         for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
3225                                                 otuCountsFile << base;
3226                                         }
3227                                 }
3228                                 otuCountsFile << endl;
3229                                 
3230                                 for(int j=0;j<nSeqsPerOTU[i];j++){
3231                                         int sequence = aaI[i][j];
3232                                         otuCountsFile << seqNameVector[sequence] << '\t';
3233                                         
3234                                         string newSeq = "";
3235                                         
3236                                         for(int k=0;k<lengths[sequence];k++){
3237                                                 char base = bases[k % 4];
3238                                                 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
3239                         
3240                                                 for(int s=0;s<freq;s++){
3241                                                         newSeq += base;
3242                                                         //otuCountsFile << base;
3243                                                 }
3244                                         }
3245                                         otuCountsFile << newSeq.substr(4) << endl;
3246                                 }
3247                                 otuCountsFile << endl;
3248                         }
3249                 }
3250                 otuCountsFile.close();
3251                 outputNames.push_back(otuCountsFileName);
3252         
3253         }
3254         catch(exception& e) {
3255                 m->errorOut(e, "ShhherCommand", "writeClusters");
3256                 exit(1);        
3257         }               
3258 }
3259
3260 /**************************************************************************************************/
3261
3262 void ShhherCommand::getSingleLookUp(){
3263         try{
3264                 //      these are the -log probabilities that a signal corresponds to a particular homopolymer length
3265                 singleLookUp.assign(HOMOPS * NUMBINS, 0);
3266                 
3267                 int index = 0;
3268                 ifstream lookUpFile;
3269                 m->openInputFile(lookupFileName, lookUpFile);
3270                 
3271                 for(int i=0;i<HOMOPS;i++){
3272                         
3273                         if (m->control_pressed) { break; }
3274                         
3275                         float logFracFreq;
3276                         lookUpFile >> logFracFreq;
3277                         
3278                         for(int j=0;j<NUMBINS;j++)      {
3279                                 lookUpFile >> singleLookUp[index];
3280                                 index++;
3281                         }
3282                 }       
3283                 lookUpFile.close();
3284         }
3285         catch(exception& e) {
3286                 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
3287                 exit(1);
3288         }
3289 }
3290
3291 /**************************************************************************************************/
3292
3293 void ShhherCommand::getJointLookUp(){
3294         try{
3295                 
3296                 //      the most likely joint probability (-log) that two intenities have the same polymer length
3297                 jointLookUp.resize(NUMBINS * NUMBINS, 0);
3298                 
3299                 for(int i=0;i<NUMBINS;i++){
3300                         
3301                         if (m->control_pressed) { break; }
3302                         
3303                         for(int j=0;j<NUMBINS;j++){             
3304                                 
3305                                 double minSum = 100000000;
3306                                 
3307                                 for(int k=0;k<HOMOPS;k++){
3308                                         double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
3309                                         
3310                                         if(sum < minSum)        {       minSum = sum;           }
3311                                 }       
3312                                 jointLookUp[i * NUMBINS + j] = minSum;
3313                         }
3314                 }
3315         }
3316         catch(exception& e) {
3317                 m->errorOut(e, "ShhherCommand", "getJointLookUp");
3318                 exit(1);
3319         }
3320 }
3321
3322 /**************************************************************************************************/
3323
3324 double ShhherCommand::getProbIntensity(int intIntensity){                          
3325         try{
3326
3327                 double minNegLogProb = 100000000; 
3328
3329                 
3330                 for(int i=0;i<HOMOPS;i++){//loop signal strength
3331                         
3332                         if (m->control_pressed) { break; }
3333                         
3334                         float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
3335                         if(negLogProb < minNegLogProb)  {       minNegLogProb = negLogProb; }
3336                 }
3337                 
3338                 return minNegLogProb;
3339         }
3340         catch(exception& e) {
3341                 m->errorOut(e, "ShhherCommand", "getProbIntensity");
3342                 exit(1);
3343         }
3344 }
3345
3346
3347
3348