]> git.donarmstrong.com Git - mothur.git/blob - shhhercommand.cpp
added sparseDistanceMatrix class. Modified cluster commands to use the new sparse...
[mothur.git] / shhhercommand.cpp
1 /*
2  *  shhher.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/27/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "shhhercommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> ShhherCommand::setParameters(){  
14         try {
15                 CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pflow);
16                 CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pfile);
17                 CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(plookup);
18                 CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(pcutoff);
19                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
20                 CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "",false,false); parameters.push_back(pmaxiter);
21         CommandParameter plarge("large", "Number", "", "-1", "", "", "",false,false); parameters.push_back(plarge);
22                 CommandParameter psigma("sigma", "Number", "", "60", "", "", "",false,false); parameters.push_back(psigma);
23                 CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "",false,false); parameters.push_back(pmindelta);
24                 CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder);
25                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
26                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
27                 
28                 vector<string> myArray;
29                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
30                 return myArray;
31         }
32         catch(exception& e) {
33                 m->errorOut(e, "ShhherCommand", "setParameters");
34                 exit(1);
35         }
36 }
37 //**********************************************************************************************************************
38 string ShhherCommand::getHelpString(){  
39         try {
40                 string helpString = "";
41                 helpString += "The shhh.flows command reads a file containing flowgrams and creates a file of corrected sequences.\n";
42                 return helpString;
43         }
44         catch(exception& e) {
45                 m->errorOut(e, "ShhherCommand", "getHelpString");
46                 exit(1);
47         }
48 }
49 //**********************************************************************************************************************
50 string ShhherCommand::getOutputFileNameTag(string type, string inputName=""){   
51         try {
52         string outputFileName = "";
53                 map<string, vector<string> >::iterator it;
54         
55         //is this a type this command creates
56         it = outputTypes.find(type);
57         if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
58         else {
59             if (type == "fasta")            {   outputFileName =  "shhh.fasta";   }
60             else if (type == "name")    {   outputFileName =  "shhh.names";   }
61             else if (type == "group")        {   outputFileName =  "shhh.groups";   }
62             else if (type == "counts")        {   outputFileName =  "shhh.counts";   }
63             else if (type == "qfile")        {   outputFileName =  "shhh.qual";   }
64             else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
65         }
66         return outputFileName;
67         }
68         catch(exception& e) {
69                 m->errorOut(e, "ShhherCommand", "getOutputFileNameTag");
70                 exit(1);
71         }
72 }
73 //**********************************************************************************************************************
74
75 ShhherCommand::ShhherCommand(){ 
76         try {
77                 abort = true; calledHelp = true;
78                 setParameters();
79                 
80                 //initialize outputTypes
81                 vector<string> tempOutNames;
82                 outputTypes["fasta"] = tempOutNames;
83         outputTypes["name"] = tempOutNames;
84         outputTypes["group"] = tempOutNames;
85         outputTypes["counts"] = tempOutNames;
86         outputTypes["qfile"] = tempOutNames;
87
88         }
89         catch(exception& e) {
90                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
91                 exit(1);
92         }
93 }
94
95 //**********************************************************************************************************************
96
97 ShhherCommand::ShhherCommand(string option) {
98         try {
99         
100 #ifdef USE_MPI
101                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
102                 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
103         
104                 if(pid == 0){
105 #endif
106                 abort = false; calledHelp = false;   
107                 
108                 //allow user to run help
109                 if(option == "help") { help(); abort = true; calledHelp = true; }
110                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
111                 
112                 else {
113                         vector<string> myArray = setParameters();
114                         
115                         OptionParser parser(option);
116                         map<string,string> parameters = parser.getParameters();
117                         
118                         ValidParameters validParameter;
119                         map<string,string>::iterator it;
120                         
121                         //check to make sure all parameters are valid for command
122                         for (it = parameters.begin(); it != parameters.end(); it++) { 
123                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
124                         }
125                         
126                         //initialize outputTypes
127             vector<string> tempOutNames;
128             outputTypes["fasta"] = tempOutNames;
129             outputTypes["name"] = tempOutNames;
130             outputTypes["group"] = tempOutNames;
131             outputTypes["counts"] = tempOutNames;
132             outputTypes["qfile"] = tempOutNames;
133
134                         
135                         //if the user changes the input directory command factory will send this info to us in the output parameter 
136                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
137                         if (inputDir == "not found"){   inputDir = "";          }
138                         else {
139                                 string path;
140                                 it = parameters.find("flow");
141                                 //user has given a template file
142                                 if(it != parameters.end()){ 
143                                         path = m->hasPath(it->second);
144                                         //if the user has not given a path then, add inputdir. else leave path alone.
145                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
146                                 }
147                                 
148                                 it = parameters.find("lookup");
149                                 //user has given a template file
150                                 if(it != parameters.end()){ 
151                                         path = m->hasPath(it->second);
152                                         //if the user has not given a path then, add inputdir. else leave path alone.
153                                         if (path == "") {       parameters["lookup"] = inputDir + it->second;           }
154                                 }
155
156                                 it = parameters.find("file");
157                                 //user has given a template file
158                                 if(it != parameters.end()){ 
159                                         path = m->hasPath(it->second);
160                                         //if the user has not given a path then, add inputdir. else leave path alone.
161                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
162                                 }
163                         }
164             
165             //if the user changes the output directory command factory will send this info to us in the output parameter 
166                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
167             
168                         //check for required parameters
169                         flowFileName = validParameter.validFile(parameters, "flow", true);
170                         flowFilesFileName = validParameter.validFile(parameters, "file", true);
171                         if (flowFileName == "not found" && flowFilesFileName == "not found") {
172                                 m->mothurOut("values for either flow or file must be provided for the shhh.flows command.");
173                                 m->mothurOutEndLine();
174                                 abort = true; 
175                         }
176                         else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
177                         
178                         if(flowFileName != "not found"){
179                                 compositeFASTAFileName = "";    
180                                 compositeNamesFileName = "";    
181                         }
182                         else{
183                                 ofstream temp;
184                 
185                 string thisoutputDir = outputDir;
186                 if (outputDir == "") {  thisoutputDir =  m->hasPath(flowFilesFileName); } //if user entered a file with a path then preserve it
187                 
188                                 //we want to rip off .files, and also .flow if its there
189                 string fileroot = m->getRootName(m->getSimpleName(flowFilesFileName));
190                 if (fileroot[fileroot.length()-1] == '.') {  fileroot = fileroot.substr(0, fileroot.length()-1); } //rip off dot
191                 string extension = m->getExtension(fileroot);
192                 if (extension == ".flow") { fileroot = m->getRootName(fileroot);  }
193                 else { fileroot += "."; } //add back if needed
194                 
195                                 compositeFASTAFileName = thisoutputDir + fileroot + "shhh.fasta";
196                                 m->openOutputFile(compositeFASTAFileName, temp);
197                                 temp.close();
198                                 
199                                 compositeNamesFileName = thisoutputDir + fileroot + "shhh.names";
200                                 m->openOutputFile(compositeNamesFileName, temp);
201                                 temp.close();
202                         }
203             
204             if(flowFilesFileName != "not found"){
205                 string fName;
206                 
207                 ifstream flowFilesFile;
208                 m->openInputFile(flowFilesFileName, flowFilesFile);
209                 while(flowFilesFile){
210                     fName = m->getline(flowFilesFile);
211                     
212                     //test if file is valid
213                     ifstream in;
214                     int ableToOpen = m->openInputFile(fName, in, "noerror");
215                     in.close(); 
216                     if (ableToOpen == 1) {
217                         if (inputDir != "") { //default path is set
218                             string tryPath = inputDir + fName;
219                             m->mothurOut("Unable to open " + fName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
220                             ifstream in2;
221                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
222                             in2.close();
223                             fName = tryPath;
224                         }
225                     }
226                     
227                     if (ableToOpen == 1) {
228                         if (m->getDefaultPath() != "") { //default path is set
229                             string tryPath = m->getDefaultPath() + m->getSimpleName(fName);
230                             m->mothurOut("Unable to open " + fName + ". Trying default " + tryPath); m->mothurOutEndLine();
231                             ifstream in2;
232                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
233                             in2.close();
234                             fName = tryPath;
235                         }
236                     }
237                     
238                     //if you can't open it its not in current working directory or inputDir, try mothur excutable location
239                     if (ableToOpen == 1) {
240                         string exepath = m->argv;
241                         string tempPath = exepath;
242                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
243                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
244                         
245                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(fName);
246                         m->mothurOut("Unable to open " + fName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
247                         ifstream in2;
248                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
249                         in2.close();
250                         fName = tryPath;
251                     }
252                     
253                     if (ableToOpen == 1) {  m->mothurOut("Unable to open " + fName + ". Disregarding. "); m->mothurOutEndLine();  }
254                     else { flowFileVector.push_back(fName); }
255                     m->gobble(flowFilesFile);
256                 }
257                 flowFilesFile.close();
258                 if (flowFileVector.size() == 0) {  m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
259             }
260             else{
261                 if (outputDir == "") { outputDir = m->hasPath(flowFileName); }
262                 flowFileVector.push_back(flowFileName);
263             }
264                 
265                         //check for optional parameter and set defaults
266                         // ...at some point should added some additional type checking...
267                         string temp;
268                         temp = validParameter.validFile(parameters, "lookup", true);
269                         if (temp == "not found")        {       
270                                 lookupFileName = "LookUp_Titanium.pat"; 
271                                 
272                                 int ableToOpen;
273                                 ifstream in;
274                                 ableToOpen = m->openInputFile(lookupFileName, in, "noerror");
275                                 in.close();     
276                                 
277                                 //if you can't open it, try input location
278                                 if (ableToOpen == 1) {
279                                         if (inputDir != "") { //default path is set
280                                                 string tryPath = inputDir + lookupFileName;
281                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
282                                                 ifstream in2;
283                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
284                                                 in2.close();
285                                                 lookupFileName = tryPath;
286                                         }
287                                 }
288                                 
289                                 //if you can't open it, try default location
290                                 if (ableToOpen == 1) {
291                                         if (m->getDefaultPath() != "") { //default path is set
292                                                 string tryPath = m->getDefaultPath() + m->getSimpleName(lookupFileName);
293                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying default " + tryPath); m->mothurOutEndLine();
294                                                 ifstream in2;
295                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
296                                                 in2.close();
297                                                 lookupFileName = tryPath;
298                                         }
299                                 }
300                                 
301                                 //if you can't open it its not in current working directory or inputDir, try mothur excutable location
302                                 if (ableToOpen == 1) {
303                                         string exepath = m->argv;
304                                         string tempPath = exepath;
305                                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
306                                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
307                                         
308                                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
309                                         m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
310                                         ifstream in2;
311                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
312                                         in2.close();
313                                         lookupFileName = tryPath;
314                                 }
315                                 
316                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
317                         }
318                         else if(temp == "not open")     {       
319                                 
320                                 lookupFileName = validParameter.validFile(parameters, "lookup", false);
321                                 
322                                 //if you can't open it its not inputDir, try mothur excutable location
323                                 string exepath = m->argv;
324                                 string tempPath = exepath;
325                                 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
326                                 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
327                                         
328                                 string tryPath = m->getFullPathName(exepath) + lookupFileName;
329                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
330                                 ifstream in2;
331                                 int ableToOpen = m->openInputFile(tryPath, in2, "noerror");
332                                 in2.close();
333                                 lookupFileName = tryPath;
334                                 
335                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
336                         }else                                           {       lookupFileName = temp;  }
337                         
338                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
339                         m->setProcessors(temp);
340                         m->mothurConvert(temp, processors);
341
342                         temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.01";          }
343                         m->mothurConvert(temp, cutoff); 
344                         
345                         temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){       temp = "0.000001";      }
346                         m->mothurConvert(temp, minDelta); 
347
348                         temp = validParameter.validFile(parameters, "maxiter", false);  if (temp == "not found"){       temp = "1000";          }
349                         m->mothurConvert(temp, maxIters); 
350             
351             temp = validParameter.validFile(parameters, "large", false);        if (temp == "not found"){       temp = "0";             }
352                         m->mothurConvert(temp, largeSize); 
353             if (largeSize != 0) { large = true; }
354             else { large = false;  }
355             if (largeSize < 0) {  m->mothurOut("The value of the large cannot be negative.\n"); }
356             
357 #ifdef USE_MPI
358             if (large) { m->mothurOut("The large parameter is not available with the MPI-Enabled version.\n"); large=false; }                           
359 #endif
360             
361
362                         temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
363                         m->mothurConvert(temp, sigma); 
364                         
365                         flowOrder = validParameter.validFile(parameters, "order", false);
366                         if (flowOrder == "not found"){ flowOrder = "TACG";              }
367                         else if(flowOrder.length() != 4){
368                                 m->mothurOut("The value of the order option must be four bases long\n");
369                         }
370                         
371                 }
372 #ifdef USE_MPI
373                 }                               
374 #endif
375         }
376         catch(exception& e) {
377                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
378                 exit(1);
379         }
380 }
381 //**********************************************************************************************************************
382 #ifdef USE_MPI
383 int ShhherCommand::execute(){
384         try {
385                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
386                 
387                 int tag = 1976;
388                 MPI_Status status; 
389                         
390                 if(pid == 0){
391
392                         for(int i=1;i<ncpus;i++){
393                                 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
394                         }
395                         if(abort == 1){ return 0;       }
396
397                         processors = ncpus;
398                         
399                         m->mothurOut("\nGetting preliminary data...\n");
400                         getSingleLookUp();      if (m->control_pressed) { return 0; }
401                         getJointLookUp();       if (m->control_pressed) { return 0; }
402                         
403             vector<string> flowFileVector;
404                         if(flowFilesFileName != "not found"){
405                                 string fName;
406                 
407                                 ifstream flowFilesFile;
408                                 m->openInputFile(flowFilesFileName, flowFilesFile);
409                                 while(flowFilesFile){
410                                         fName = m->getline(flowFilesFile);
411                                         flowFileVector.push_back(fName);
412                                         m->gobble(flowFilesFile);
413                                 }
414                         }
415                         else{
416                                 flowFileVector.push_back(flowFileName);
417                         }
418             
419                         int numFiles = flowFileVector.size();
420
421                         for(int i=1;i<ncpus;i++){
422                                 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
423                         }
424                         
425                         for(int i=0;i<numFiles;i++){
426                                 
427                                 if (m->control_pressed) { break; }
428                                 
429                                 double begClock = clock();
430                                 unsigned long long begTime = time(NULL);
431
432                                 flowFileName = flowFileVector[i];
433                                 
434                                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
435                                 m->mothurOut("Reading flowgrams...\n");
436                                 getFlowData();
437                                 
438                                 if (m->control_pressed) { break; }
439
440                                 m->mothurOut("Identifying unique flowgrams...\n");
441                                 getUniques();
442                                 
443                                 if (m->control_pressed) { break; }
444
445                                 m->mothurOut("Calculating distances between flowgrams...\n");
446                                 char fileName[1024];
447                                 strcpy(fileName, flowFileName.c_str());
448
449                                 for(int i=1;i<ncpus;i++){
450                                         MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
451
452                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
453                                         MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
454                                         MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
455                                         MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
456                                         MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
457                                         MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
458                                         MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
459                                         MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
460                                         MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
461                                         MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
462                                 }                               
463                                                         
464                                 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
465                                 
466                                 if (m->control_pressed) { break; }
467                                 
468                                 int done;
469                                 for(int i=1;i<ncpus;i++){
470                                         MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
471                                         
472                                         m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
473                                         m->mothurRemove((distFileName + ".temp." + toString(i)));
474                                 }
475
476                                 string namesFileName = createNamesFile();
477                                 
478                                 if (m->control_pressed) { break; }
479                                 
480                                 m->mothurOut("\nClustering flowgrams...\n");
481                                 string listFileName = cluster(distFileName, namesFileName);
482                                 
483                                 if (m->control_pressed) { break; }
484                                 
485                 
486                 
487                                 getOTUData(listFileName);
488
489                                 m->mothurRemove(distFileName);
490                                 m->mothurRemove(namesFileName);
491                                 m->mothurRemove(listFileName);
492                                 
493                                 if (m->control_pressed) { break; }
494                                 
495                                 initPyroCluster();
496
497                                 if (m->control_pressed) { break; }
498                                 
499             
500                                 for(int i=1;i<ncpus;i++){
501                                         MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
502                                         MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
503                                         MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
504                                         MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
505                                 }
506                                 
507                                 if (m->control_pressed) { break; }
508                                 
509                                 double maxDelta = 0;
510                                 int iter = 0;
511                                 
512                                 int numOTUsOnCPU = numOTUs / ncpus;
513                                 int numSeqsOnCPU = numSeqs / ncpus;
514                                 m->mothurOut("\nDenoising flowgrams...\n");
515                                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
516                                 
517                                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
518                                         
519                                         double cycClock = clock();
520                                         unsigned long long cycTime = time(NULL);
521                                         fill();
522                                         
523                                         if (m->control_pressed) { break; }
524
525                                         int total = singleTau.size();
526                                         for(int i=1;i<ncpus;i++){
527                                                 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
528                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
529                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
530                                                 
531                                                 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
532                                                 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
533                                                 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
534                                                 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
535                                                 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
536                                         }
537                                         
538                                         calcCentroidsDriver(0, numOTUsOnCPU);
539                                         
540                                         for(int i=1;i<ncpus;i++){
541                                                 int otuStart = i * numOTUs / ncpus;
542                                                 int otuStop = (i + 1) * numOTUs / ncpus;
543                                                 
544                                                 vector<int> tempCentroids(numOTUs, 0);
545                                                 vector<short> tempChange(numOTUs, 0);
546                                                 
547                                                 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
548                                                 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
549                                                 
550                                                 for(int j=otuStart;j<otuStop;j++){
551                                                         centroids[j] = tempCentroids[j];
552                                                         change[j] = tempChange[j];
553                                                 }
554                                         }
555                                                                         
556                                         maxDelta = getNewWeights(); if (m->control_pressed) { break; }
557                                         double nLL = getLikelihood(); if (m->control_pressed) { break; }
558                                         checkCentroids(); if (m->control_pressed) { break; }
559                                         
560                                         for(int i=1;i<ncpus;i++){
561                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
562                                                 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
563                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
564                                         }
565                                         
566                                         calcNewDistancesParent(0, numSeqsOnCPU);
567
568                                         total = singleTau.size();
569
570                                         for(int i=1;i<ncpus;i++){
571                                                 int childTotal;
572                                                 int seqStart = i * numSeqs / ncpus;
573                                                 int seqStop = (i + 1) * numSeqs / ncpus;
574                                                 
575                                                 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
576
577                                                 vector<int> childSeqIndex(childTotal, 0);
578                                                 vector<double> childSingleTau(childTotal, 0);
579                                                 vector<double> childDist(numSeqs * numOTUs, 0);
580                                                 vector<int> otuIndex(childTotal, 0);
581                                                 
582                                                 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
583                                                 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
584                                                 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
585                                                 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
586
587                                                 int oldTotal = total;
588                                                 total += childTotal;
589                                                 singleTau.resize(total, 0);
590                                                 seqIndex.resize(total, 0);
591                                                 seqNumber.resize(total, 0);
592                                                 
593                                                 int childIndex = 0;
594                                                 
595                                                 for(int j=oldTotal;j<total;j++){
596                                                         int otuI = otuIndex[childIndex];
597                                                         int seqI = childSeqIndex[childIndex];
598                                                         
599                                                         singleTau[j] = childSingleTau[childIndex];
600                                                         
601                                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
602                                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
603                                                         nSeqsPerOTU[otuI]++;
604                                                         childIndex++;
605                                                 }
606                                                 
607                                                 int index = seqStart * numOTUs;
608                                                 for(int j=seqStart;j<seqStop;j++){
609                                                         for(int k=0;k<numOTUs;k++){
610                                                                 dist[index] = childDist[index];
611                                                                 index++;
612                                                         }
613                                                 }                                       
614                                         }
615                                         
616                                         iter++;
617                                         
618                                         m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');                  
619
620                                         if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
621                                                 int live = 1;
622                                                 for(int i=1;i<ncpus;i++){
623                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
624                                                 }
625                                         }
626                                         else{
627                                                 int live = 0;
628                                                 for(int i=1;i<ncpus;i++){
629                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
630                                                 }
631                                         }
632                                         
633                                 }       
634                                 
635                                 if (m->control_pressed) { break; }
636                                 
637                                 m->mothurOut("\nFinalizing...\n");
638                                 fill();
639                                 
640                                 if (m->control_pressed) { break; }
641                                 
642                                 setOTUs();
643                                 
644                                 vector<int> otuCounts(numOTUs, 0);
645                                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
646                                 calcCentroidsDriver(0, numOTUs);
647                                 
648                                 if (m->control_pressed) { break; }
649                                 
650                                 writeQualities(otuCounts);      if (m->control_pressed) { break; }
651                                 writeSequences(otuCounts);      if (m->control_pressed) { break; }
652                                 writeNames(otuCounts);          if (m->control_pressed) { break; }
653                                 writeClusters(otuCounts);       if (m->control_pressed) { break; }
654                                 writeGroups();                          if (m->control_pressed) { break; }
655                                 
656                                                                  
657                                 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');                 
658                         }
659                 }
660                 else{
661                         int abort = 1;
662
663                         MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
664                         if(abort){      return 0;       }
665
666                         int numFiles;
667                         MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
668
669                         for(int i=0;i<numFiles;i++){
670                                 
671                                 if (m->control_pressed) { break; }
672                                 
673                                 //Now into the pyrodist part
674                                 bool live = 1;
675
676                                 char fileName[1024];
677                                 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
678                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
679                                 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
680                                 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
681                                 
682                                 flowDataIntI.resize(numSeqs * numFlowCells);
683                                 flowDataPrI.resize(numSeqs * numFlowCells);
684                                 mapUniqueToSeq.resize(numSeqs);
685                                 mapSeqToUnique.resize(numSeqs);
686                                 lengths.resize(numSeqs);
687                                 jointLookUp.resize(NUMBINS * NUMBINS);
688
689                                 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
690                                 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
691                                 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
692                                 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
693                                 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
694                                 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
695                                 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
696
697                                 flowFileName = string(fileName);
698                                 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
699                                 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
700                                 
701                                 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
702                                 
703                                 if (m->control_pressed) { break; }
704                                 
705                                 int done = 1;
706                                 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
707
708                                 //Now into the pyronoise part
709                                 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
710                                 
711                                 singleLookUp.resize(HOMOPS * NUMBINS);
712                                 uniqueFlowgrams.resize(numUniques * numFlowCells);
713                                 weight.resize(numOTUs);
714                                 centroids.resize(numOTUs);
715                                 change.resize(numOTUs);
716                                 dist.assign(numOTUs * numSeqs, 0);
717                                 nSeqsPerOTU.resize(numOTUs);
718                                 cumNumSeqs.resize(numOTUs);
719
720                                 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
721                                 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
722                                 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
723                                 
724                                 int startOTU = pid * numOTUs / ncpus;
725                                 int endOTU = (pid + 1) * numOTUs / ncpus;
726
727                                 int startSeq = pid * numSeqs / ncpus;
728                                 int endSeq = (pid + 1) * numSeqs /ncpus;
729                                 
730                                 int total;
731
732                                 while(live){
733                                         
734                                         if (m->control_pressed) { break; }
735                                         
736                                         MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
737                                         singleTau.assign(total, 0.0000);
738                                         seqNumber.assign(total, 0);
739                                         seqIndex.assign(total, 0);
740
741                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
742                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
743                                         MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
744                                         MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
745                                         MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
746                                         MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
747                                         MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
748
749                                         calcCentroidsDriver(startOTU, endOTU);
750
751                                         MPI_Send(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
752                                         MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
753
754                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
755                                         MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
756                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
757
758                                         vector<int> otuIndex(total, 0);
759                                         calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
760                                         total = otuIndex.size();
761                                         
762                                         MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
763                                         MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
764                                         MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
765                                         MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
766                                         MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
767                                 
768                                         MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
769                                 }
770                         }
771                 }
772                 
773                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
774                 
775                 MPI_Barrier(MPI_COMM_WORLD);
776
777                 
778                 if(compositeFASTAFileName != ""){
779                         outputNames.push_back(compositeFASTAFileName);
780                         outputNames.push_back(compositeNamesFileName);
781                 }
782
783                 m->mothurOutEndLine();
784                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
785                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
786                 m->mothurOutEndLine();
787                 
788                 return 0;
789
790         }
791         catch(exception& e) {
792                 m->errorOut(e, "ShhherCommand", "execute");
793                 exit(1);
794         }
795 }
796 /**************************************************************************************************/
797 string ShhherCommand::createNamesFile(){
798         try{
799                 
800                 vector<string> duplicateNames(numUniques, "");
801                 for(int i=0;i<numSeqs;i++){
802                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
803                 }
804                 
805                 string nameFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName)) + getOutputFileNameTag("name");
806                 
807                 ofstream nameFile;
808                 m->openOutputFile(nameFileName, nameFile);
809                 
810                 for(int i=0;i<numUniques;i++){
811                         
812                         if (m->control_pressed) { break; }
813                         
814             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
815                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
816                 }
817                 
818                 nameFile.close();
819                 return  nameFileName;
820         }
821         catch(exception& e) {
822                 m->errorOut(e, "ShhherCommand", "createNamesFile");
823                 exit(1);
824         }
825 }
826 /**************************************************************************************************/
827
828 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
829         try{            
830                 ostringstream outStream;
831                 outStream.setf(ios::fixed, ios::floatfield);
832                 outStream.setf(ios::dec, ios::basefield);
833                 outStream.setf(ios::showpoint);
834                 outStream.precision(6);
835                 
836                 int begTime = time(NULL);
837                 double begClock = clock();
838                 
839                 for(int i=startSeq;i<stopSeq;i++){
840                         
841                         if (m->control_pressed) { break; }
842                         
843                         for(int j=0;j<i;j++){
844                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
845                                 
846                                 if(flowDistance < 1e-6){
847                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
848                                 }
849                                 else if(flowDistance <= cutoff){
850                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
851                                 }
852                         }
853                         if(i % 100 == 0){
854                                 m->mothurOut(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
855                         }
856                 }
857                 
858                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
859                 if(pid != 0){   fDistFileName += ".temp." + toString(pid);      }
860                 
861                 if (m->control_pressed) { return fDistFileName; }
862                 
863                 m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
864
865                 ofstream distFile(fDistFileName.c_str());
866                 distFile << outStream.str();            
867                 distFile.close();
868                 
869                 return fDistFileName;
870         }
871         catch(exception& e) {
872                 m->errorOut(e, "ShhherCommand", "flowDistMPI");
873                 exit(1);
874         }
875 }
876 /**************************************************************************************************/
877  
878 void ShhherCommand::getOTUData(string listFileName){
879     try {
880         
881         ifstream listFile;
882         m->openInputFile(listFileName, listFile);
883         string label;
884         
885         listFile >> label >> numOTUs;
886         
887         otuData.assign(numSeqs, 0);
888         cumNumSeqs.assign(numOTUs, 0);
889         nSeqsPerOTU.assign(numOTUs, 0);
890         aaP.clear();aaP.resize(numOTUs);
891         
892         seqNumber.clear();
893         aaI.clear();
894         seqIndex.clear();
895         
896         string singleOTU = "";
897         
898         for(int i=0;i<numOTUs;i++){
899             
900             if (m->control_pressed) { break; }
901             
902             listFile >> singleOTU;
903             
904             istringstream otuString(singleOTU);
905             
906             while(otuString){
907                 
908                 string seqName = "";
909                 
910                 for(int j=0;j<singleOTU.length();j++){
911                     char letter = otuString.get();
912                     
913                     if(letter != ','){
914                         seqName += letter;
915                     }
916                     else{
917                         map<string,int>::iterator nmIt = nameMap.find(seqName);
918                         int index = nmIt->second;
919                         
920                         nameMap.erase(nmIt);
921                         
922                         otuData[index] = i;
923                         nSeqsPerOTU[i]++;
924                         aaP[i].push_back(index);
925                         seqName = "";
926                     }
927                 }
928                 
929                 map<string,int>::iterator nmIt = nameMap.find(seqName);
930                 
931                 int index = nmIt->second;
932                 nameMap.erase(nmIt);
933                 
934                 otuData[index] = i;
935                 nSeqsPerOTU[i]++;
936                 aaP[i].push_back(index);        
937                 
938                 otuString.get();
939             }
940             
941             sort(aaP[i].begin(), aaP[i].end());
942             for(int j=0;j<nSeqsPerOTU[i];j++){
943                 seqNumber.push_back(aaP[i][j]);
944             }
945             for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
946                 aaP[i].push_back(0);
947             }
948             
949             
950         }
951         
952         for(int i=1;i<numOTUs;i++){
953             cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
954         }
955         aaI = aaP;
956         seqIndex = seqNumber;
957         
958         listFile.close();       
959         
960     }
961     catch(exception& e) {
962         m->errorOut(e, "ShhherCommand", "getOTUData");
963         exit(1);        
964     }           
965 }
966
967 /**************************************************************************************************/
968
969 void ShhherCommand::initPyroCluster(){                          
970     try{
971         if (numOTUs < processors) { processors = 1; }
972         
973         if (m->debug) { m->mothurOut("[DEBUG]: numSeqs = " + toString(numSeqs) + " numOTUS = " + toString(numOTUs) + " about to alloc a dist vector with size = " + toString((numSeqs * numOTUs)) + ".\n"); }
974         
975         dist.assign(numSeqs * numOTUs, 0);
976         change.assign(numOTUs, 1);
977         centroids.assign(numOTUs, -1);
978         weight.assign(numOTUs, 0);
979         singleTau.assign(numSeqs, 1.0);
980         
981         nSeqsBreaks.assign(processors+1, 0);
982         nOTUsBreaks.assign(processors+1, 0);
983         
984         if (m->debug) { m->mothurOut("[DEBUG]: made it through the memory allocation.\n"); }
985         
986         nSeqsBreaks[0] = 0;
987         for(int i=0;i<processors;i++){
988             nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
989             nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
990         }
991         nSeqsBreaks[processors] = numSeqs;
992         nOTUsBreaks[processors] = numOTUs;
993     }
994     catch(exception& e) {
995         m->errorOut(e, "ShhherCommand", "initPyroCluster");
996         exit(1);        
997     }
998 }
999
1000 /**************************************************************************************************/
1001
1002 void ShhherCommand::fill(){
1003     try {
1004         int index = 0;
1005         for(int i=0;i<numOTUs;i++){
1006             
1007             if (m->control_pressed) { break; }
1008             
1009             cumNumSeqs[i] = index;
1010             for(int j=0;j<nSeqsPerOTU[i];j++){
1011                 seqNumber[index] = aaP[i][j];
1012                 seqIndex[index] = aaI[i][j];
1013                 
1014                 index++;
1015             }
1016         }
1017     }
1018     catch(exception& e) {
1019         m->errorOut(e, "ShhherCommand", "fill");
1020         exit(1);        
1021     }           
1022 }
1023
1024 /**************************************************************************************************/
1025  
1026 void ShhherCommand::getFlowData(){
1027     try{
1028         ifstream flowFile;
1029         m->openInputFile(flowFileName, flowFile);
1030         
1031         string seqName;
1032         seqNameVector.clear();
1033         lengths.clear();
1034         flowDataIntI.clear();
1035         nameMap.clear();
1036         
1037         
1038         int currentNumFlowCells;
1039         
1040         float intensity;
1041         
1042         flowFile >> numFlowCells;
1043         int index = 0;//pcluster
1044         while(!flowFile.eof()){
1045             
1046             if (m->control_pressed) { break; }
1047             
1048             flowFile >> seqName >> currentNumFlowCells;
1049             lengths.push_back(currentNumFlowCells);
1050             
1051             seqNameVector.push_back(seqName);
1052             nameMap[seqName] = index++;//pcluster
1053             
1054             for(int i=0;i<numFlowCells;i++){
1055                 flowFile >> intensity;
1056                 if(intensity > 9.99)    {       intensity = 9.99;       }
1057                 int intI = int(100 * intensity + 0.0001);
1058                 flowDataIntI.push_back(intI);
1059             }
1060             m->gobble(flowFile);
1061         }
1062         flowFile.close();
1063         
1064         numSeqs = seqNameVector.size();         
1065         
1066         for(int i=0;i<numSeqs;i++){
1067             
1068             if (m->control_pressed) { break; }
1069             
1070             int iNumFlowCells = i * numFlowCells;
1071             for(int j=lengths[i];j<numFlowCells;j++){
1072                 flowDataIntI[iNumFlowCells + j] = 0;
1073             }
1074         }
1075         
1076     }
1077     catch(exception& e) {
1078         m->errorOut(e, "ShhherCommand", "getFlowData");
1079         exit(1);
1080     }
1081 }
1082 /**************************************************************************************************/
1083 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1084         
1085         try{
1086                 vector<double> newTau(numOTUs,0);
1087                 vector<double> norms(numSeqs, 0);
1088                 otuIndex.clear();
1089                 seqIndex.clear();
1090                 singleTau.clear();
1091                 
1092                 for(int i=startSeq;i<stopSeq;i++){
1093                         
1094                         if (m->control_pressed) { break; }
1095                         
1096                         double offset = 1e8;
1097                         int indexOffset = i * numOTUs;
1098                         
1099                         for(int j=0;j<numOTUs;j++){
1100                                 
1101                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1102                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1103                                 }
1104                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1105                                         offset = dist[indexOffset + j];
1106                                 }
1107                         }
1108                         
1109                         for(int j=0;j<numOTUs;j++){
1110                                 if(weight[j] > MIN_WEIGHT){
1111                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1112                                         norms[i] += newTau[j];
1113                                 }
1114                                 else{
1115                                         newTau[j] = 0.0;
1116                                 }
1117                         }
1118                         
1119                         for(int j=0;j<numOTUs;j++){
1120                 
1121                                 newTau[j] /= norms[i];
1122                                 
1123                                 if(newTau[j] > MIN_TAU){
1124                                         otuIndex.push_back(j);
1125                                         seqIndex.push_back(i);
1126                                         singleTau.push_back(newTau[j]);
1127                                 }
1128                         }
1129                         
1130                 }
1131         }
1132         catch(exception& e) {
1133                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1134                 exit(1);        
1135         }               
1136 }
1137
1138 /**************************************************************************************************/
1139
1140 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1141     
1142     try{
1143         
1144         int total = 0;
1145         vector<double> newTau(numOTUs,0);
1146         vector<double> norms(numSeqs, 0);
1147         nSeqsPerOTU.assign(numOTUs, 0);
1148         
1149         for(int i=startSeq;i<stopSeq;i++){
1150             
1151             if (m->control_pressed) { break; }
1152             
1153             int indexOffset = i * numOTUs;
1154             
1155             double offset = 1e8;
1156             
1157             for(int j=0;j<numOTUs;j++){
1158                 
1159                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1160                     dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1161                 }
1162                 
1163                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1164                     offset = dist[indexOffset + j];
1165                 }
1166             }
1167             
1168             for(int j=0;j<numOTUs;j++){
1169                 if(weight[j] > MIN_WEIGHT){
1170                     newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1171                     norms[i] += newTau[j];
1172                 }
1173                 else{
1174                     newTau[j] = 0.0;
1175                 }
1176             }
1177             
1178             for(int j=0;j<numOTUs;j++){
1179                 newTau[j] /= norms[i];
1180             }
1181             
1182             for(int j=0;j<numOTUs;j++){
1183                 if(newTau[j] > MIN_TAU){
1184                     
1185                     int oldTotal = total;
1186                     
1187                     total++;
1188                     
1189                     singleTau.resize(total, 0);
1190                     seqNumber.resize(total, 0);
1191                     seqIndex.resize(total, 0);
1192                     
1193                     singleTau[oldTotal] = newTau[j];
1194                     
1195                     aaP[j][nSeqsPerOTU[j]] = oldTotal;
1196                     aaI[j][nSeqsPerOTU[j]] = i;
1197                     nSeqsPerOTU[j]++;
1198                 }
1199             }
1200             
1201         }
1202         
1203     }
1204     catch(exception& e) {
1205         m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1206         exit(1);        
1207     }           
1208 }
1209
1210 /**************************************************************************************************/
1211
1212 void ShhherCommand::setOTUs(){
1213     
1214     try {
1215         vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1216         
1217         for(int i=0;i<numOTUs;i++){
1218             
1219             if (m->control_pressed) { break; }
1220             
1221             for(int j=0;j<nSeqsPerOTU[i];j++){
1222                 int index = cumNumSeqs[i] + j;
1223                 double tauValue = singleTau[seqNumber[index]];
1224                 int sIndex = seqIndex[index];
1225                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
1226             }
1227         }
1228         
1229         for(int i=0;i<numSeqs;i++){
1230             double maxTau = -1.0000;
1231             int maxOTU = -1;
1232             for(int j=0;j<numOTUs;j++){
1233                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1234                     maxTau = bigTauMatrix[i * numOTUs + j];
1235                     maxOTU = j;
1236                 }
1237             }
1238             
1239             otuData[i] = maxOTU;
1240         }
1241         
1242         nSeqsPerOTU.assign(numOTUs, 0);         
1243         
1244         for(int i=0;i<numSeqs;i++){
1245             int index = otuData[i];
1246             
1247             singleTau[i] = 1.0000;
1248             dist[i] = 0.0000;
1249             
1250             aaP[index][nSeqsPerOTU[index]] = i;
1251             aaI[index][nSeqsPerOTU[index]] = i;
1252             
1253             nSeqsPerOTU[index]++;
1254         }
1255         fill(); 
1256     }
1257     catch(exception& e) {
1258         m->errorOut(e, "ShhherCommand", "setOTUs");
1259         exit(1);        
1260     }           
1261 }
1262
1263 /**************************************************************************************************/
1264  
1265 void ShhherCommand::getUniques(){
1266     try{
1267         
1268         
1269         numUniques = 0;
1270         uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
1271         uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
1272         uniqueLengths.assign(numSeqs, 0);
1273         mapSeqToUnique.assign(numSeqs, -1);
1274         mapUniqueToSeq.assign(numSeqs, -1);
1275         
1276         vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
1277         
1278         for(int i=0;i<numSeqs;i++){
1279             
1280             if (m->control_pressed) { break; }
1281             
1282             int index = 0;
1283             
1284             vector<short> current(numFlowCells);
1285             for(int j=0;j<numFlowCells;j++){
1286                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
1287             }
1288             
1289             for(int j=0;j<numUniques;j++){
1290                 int offset = j * numFlowCells;
1291                 bool toEnd = 1;
1292                 
1293                 int shorterLength;
1294                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
1295                 else                                                            {       shorterLength = uniqueLengths[j];       }
1296                 
1297                 for(int k=0;k<shorterLength;k++){
1298                     if(current[k] != uniqueFlowgrams[offset + k]){
1299                         toEnd = 0;
1300                         break;
1301                     }
1302                 }
1303                 
1304                 if(toEnd){
1305                     mapSeqToUnique[i] = j;
1306                     uniqueCount[j]++;
1307                     index = j;
1308                     if(lengths[i] > uniqueLengths[j])   {       uniqueLengths[j] = lengths[i];  }
1309                     break;
1310                 }
1311                 index++;
1312             }
1313             
1314             if(index == numUniques){
1315                 uniqueLengths[numUniques] = lengths[i];
1316                 uniqueCount[numUniques] = 1;
1317                 mapSeqToUnique[i] = numUniques;//anMap
1318                 mapUniqueToSeq[numUniques] = i;//anF
1319                 
1320                 for(int k=0;k<numFlowCells;k++){
1321                     uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
1322                     uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
1323                 }
1324                 
1325                 numUniques++;
1326             }
1327         }
1328         uniqueFlowDataIntI.resize(numFlowCells * numUniques);
1329         uniqueLengths.resize(numUniques);       
1330         
1331         flowDataPrI.resize(numSeqs * numFlowCells, 0);
1332         for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
1333     }
1334     catch(exception& e) {
1335         m->errorOut(e, "ShhherCommand", "getUniques");
1336         exit(1);
1337     }
1338 }
1339
1340 /**************************************************************************************************/
1341
1342 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
1343     try{
1344         int minLength = lengths[mapSeqToUnique[seqA]];
1345         if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
1346         
1347         int ANumFlowCells = seqA * numFlowCells;
1348         int BNumFlowCells = seqB * numFlowCells;
1349         
1350         float dist = 0;
1351         
1352         for(int i=0;i<minLength;i++){
1353             
1354             if (m->control_pressed) { break; }
1355             
1356             int flowAIntI = flowDataIntI[ANumFlowCells + i];
1357             float flowAPrI = flowDataPrI[ANumFlowCells + i];
1358             
1359             int flowBIntI = flowDataIntI[BNumFlowCells + i];
1360             float flowBPrI = flowDataPrI[BNumFlowCells + i];
1361             dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
1362         }
1363         
1364         dist /= (float) minLength;
1365         return dist;
1366     }
1367     catch(exception& e) {
1368         m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
1369         exit(1);
1370     }
1371 }
1372
1373 //**********************************************************************************************************************/
1374
1375 string ShhherCommand::cluster(string distFileName, string namesFileName){
1376     try {
1377         
1378         ReadMatrix* read = new ReadColumnMatrix(distFileName);  
1379         read->setCutoff(cutoff);
1380         
1381         NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1382         clusterNameMap->readMap();
1383         read->read(clusterNameMap);
1384         
1385         ListVector* list = read->getListVector();
1386         SparseMatrix* matrix = read->getMatrix();
1387         
1388         delete read; 
1389         delete clusterNameMap; 
1390         
1391         RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
1392         
1393         Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
1394         string tag = cluster->getTag();
1395         
1396         double clusterCutoff = cutoff;
1397         while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1398             
1399             if (m->control_pressed) { break; }
1400             
1401             cluster->update(clusterCutoff);
1402         }
1403         
1404         list->setLabel(toString(cutoff));
1405         
1406         string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
1407         ofstream listFile;
1408         m->openOutputFile(listFileName, listFile);
1409         list->print(listFile);
1410         listFile.close();
1411         
1412         delete matrix;  delete cluster; delete rabund; delete list;
1413         
1414         return listFileName;
1415     }
1416     catch(exception& e) {
1417         m->errorOut(e, "ShhherCommand", "cluster");
1418         exit(1);        
1419     }           
1420 }
1421
1422 /**************************************************************************************************/
1423
1424 void ShhherCommand::calcCentroidsDriver(int start, int finish){                          
1425     
1426     //this function gets the most likely homopolymer length at a flow position for a group of sequences
1427     //within an otu
1428     
1429     try{
1430         
1431         for(int i=start;i<finish;i++){
1432             
1433             if (m->control_pressed) { break; }
1434             
1435             double count = 0;
1436             int position = 0;
1437             int minFlowGram = 100000000;
1438             double minFlowValue = 1e8;
1439             change[i] = 0; //FALSE
1440             
1441             for(int j=0;j<nSeqsPerOTU[i];j++){
1442                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1443             }
1444             
1445             if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1446                 vector<double> adF(nSeqsPerOTU[i]);
1447                 vector<int> anL(nSeqsPerOTU[i]);
1448                 
1449                 for(int j=0;j<nSeqsPerOTU[i];j++){
1450                     int index = cumNumSeqs[i] + j;
1451                     int nI = seqIndex[index];
1452                     int nIU = mapSeqToUnique[nI];
1453                     
1454                     int k;
1455                     for(k=0;k<position;k++){
1456                         if(nIU == anL[k]){
1457                             break;
1458                         }
1459                     }
1460                     if(k == position){
1461                         anL[position] = nIU;
1462                         adF[position] = 0.0000;
1463                         position++;
1464                     }                                           
1465                 }
1466                 
1467                 for(int j=0;j<nSeqsPerOTU[i];j++){
1468                     int index = cumNumSeqs[i] + j;
1469                     int nI = seqIndex[index];
1470                     
1471                     double tauValue = singleTau[seqNumber[index]];
1472                     
1473                     for(int k=0;k<position;k++){
1474                         double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1475                         adF[k] += dist * tauValue;
1476                     }
1477                 }
1478                 
1479                 for(int j=0;j<position;j++){
1480                     if(adF[j] < minFlowValue){
1481                         minFlowGram = j;
1482                         minFlowValue = adF[j];
1483                     }
1484                 }
1485                 
1486                 if(centroids[i] != anL[minFlowGram]){
1487                     change[i] = 1;
1488                     centroids[i] = anL[minFlowGram];
1489                 }
1490             }
1491             else if(centroids[i] != -1){
1492                 change[i] = 1;
1493                 centroids[i] = -1;                      
1494             }
1495         }
1496     }
1497     catch(exception& e) {
1498         m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1499         exit(1);        
1500     }           
1501 }
1502
1503 /**************************************************************************************************/
1504
1505 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1506     try{
1507         
1508         int flowAValue = cent * numFlowCells;
1509         int flowBValue = flow * numFlowCells;
1510         
1511         double dist = 0;
1512         
1513         for(int i=0;i<length;i++){
1514             dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1515             flowAValue++;
1516             flowBValue++;
1517         }
1518         
1519         return dist / (double)length;
1520     }
1521     catch(exception& e) {
1522         m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1523         exit(1);        
1524     }           
1525 }
1526
1527 /**************************************************************************************************/
1528
1529 double ShhherCommand::getNewWeights(){
1530     try{
1531         
1532         double maxChange = 0;
1533         
1534         for(int i=0;i<numOTUs;i++){
1535             
1536             if (m->control_pressed) { break; }
1537             
1538             double difference = weight[i];
1539             weight[i] = 0;
1540             
1541             for(int j=0;j<nSeqsPerOTU[i];j++){
1542                 int index = cumNumSeqs[i] + j;
1543                 double tauValue = singleTau[seqNumber[index]];
1544                 weight[i] += tauValue;
1545             }
1546             
1547             difference = fabs(weight[i] - difference);
1548             if(difference > maxChange){ maxChange = difference; }
1549         }
1550         return maxChange;
1551     }
1552     catch(exception& e) {
1553         m->errorOut(e, "ShhherCommand", "getNewWeights");
1554         exit(1);        
1555     }           
1556 }
1557  
1558  /**************************************************************************************************/
1559  
1560 double ShhherCommand::getLikelihood(){
1561     
1562     try{
1563         
1564         vector<long double> P(numSeqs, 0);
1565         int effNumOTUs = 0;
1566         
1567         for(int i=0;i<numOTUs;i++){
1568             if(weight[i] > MIN_WEIGHT){
1569                 effNumOTUs++;
1570             }
1571         }
1572         
1573         string hold;
1574         for(int i=0;i<numOTUs;i++){
1575             
1576             if (m->control_pressed) { break; }
1577             
1578             for(int j=0;j<nSeqsPerOTU[i];j++){
1579                 int index = cumNumSeqs[i] + j;
1580                 int nI = seqIndex[index];
1581                 double singleDist = dist[seqNumber[index]];
1582                 
1583                 P[nI] += weight[i] * exp(-singleDist * sigma);
1584             }
1585         }
1586         double nLL = 0.00;
1587         for(int i=0;i<numSeqs;i++){
1588             if(P[i] == 0){      P[i] = DBL_EPSILON;     }
1589             
1590             nLL += -log(P[i]);
1591         }
1592         
1593         nLL = nLL -(double)numSeqs * log(sigma);
1594         
1595         return nLL; 
1596     }
1597     catch(exception& e) {
1598         m->errorOut(e, "ShhherCommand", "getNewWeights");
1599         exit(1);        
1600     }           
1601 }
1602
1603 /**************************************************************************************************/
1604
1605 void ShhherCommand::checkCentroids(){
1606     try{
1607         vector<int> unique(numOTUs, 1);
1608         
1609         for(int i=0;i<numOTUs;i++){
1610             if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1611                 unique[i] = -1;
1612             }
1613         }
1614         
1615         for(int i=0;i<numOTUs;i++){
1616             
1617             if (m->control_pressed) { break; }
1618             
1619             if(unique[i] == 1){
1620                 for(int j=i+1;j<numOTUs;j++){
1621                     if(unique[j] == 1){
1622                         
1623                         if(centroids[j] == centroids[i]){
1624                             unique[j] = 0;
1625                             centroids[j] = -1;
1626                             
1627                             weight[i] += weight[j];
1628                             weight[j] = 0.0;
1629                         }
1630                     }
1631                 }
1632             }
1633         }
1634     }
1635     catch(exception& e) {
1636         m->errorOut(e, "ShhherCommand", "checkCentroids");
1637         exit(1);        
1638     }           
1639 }
1640  /**************************************************************************************************/
1641
1642
1643  
1644 void ShhherCommand::writeQualities(vector<int> otuCounts){
1645     
1646     try {
1647         string thisOutputDir = outputDir;
1648         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1649         string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + getOutputFileNameTag("qfile");
1650         
1651         ofstream qualityFile;
1652         m->openOutputFile(qualityFileName, qualityFile);
1653         
1654         qualityFile.setf(ios::fixed, ios::floatfield);
1655         qualityFile.setf(ios::showpoint);
1656         qualityFile << setprecision(6);
1657         
1658         vector<vector<int> > qualities(numOTUs);
1659         vector<double> pr(HOMOPS, 0);
1660         
1661         
1662         for(int i=0;i<numOTUs;i++){
1663             
1664             if (m->control_pressed) { break; }
1665             
1666             int index = 0;
1667             int base = 0;
1668             
1669             if(nSeqsPerOTU[i] > 0){
1670                 qualities[i].assign(1024, -1);
1671                 
1672                 while(index < numFlowCells){
1673                     double maxPrValue = 1e8;
1674                     short maxPrIndex = -1;
1675                     double count = 0.0000;
1676                     
1677                     pr.assign(HOMOPS, 0);
1678                     
1679                     for(int j=0;j<nSeqsPerOTU[i];j++){
1680                         int lIndex = cumNumSeqs[i] + j;
1681                         double tauValue = singleTau[seqNumber[lIndex]];
1682                         int sequenceIndex = aaI[i][j];
1683                         short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1684                         
1685                         count += tauValue;
1686                         
1687                         for(int s=0;s<HOMOPS;s++){
1688                             pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1689                         }
1690                     }
1691                     
1692                     maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1693                     maxPrValue = pr[maxPrIndex];
1694                     
1695                     if(count > MIN_COUNT){
1696                         double U = 0.0000;
1697                         double norm = 0.0000;
1698                         
1699                         for(int s=0;s<HOMOPS;s++){
1700                             norm += exp(-(pr[s] - maxPrValue));
1701                         }
1702                         
1703                         for(int s=1;s<=maxPrIndex;s++){
1704                             int value = 0;
1705                             double temp = 0.0000;
1706                             
1707                             U += exp(-(pr[s-1]-maxPrValue))/norm;
1708                             
1709                             if(U>0.00){
1710                                 temp = log10(U);
1711                             }
1712                             else{
1713                                 temp = -10.1;
1714                             }
1715                             temp = floor(-10 * temp);
1716                             value = (int)floor(temp);
1717                             if(value > 100){    value = 100;    }
1718                             
1719                             qualities[i][base] = (int)value;
1720                             base++;
1721                         }
1722                     }
1723                     
1724                     index++;
1725                 }
1726             }
1727             
1728             
1729             if(otuCounts[i] > 0){
1730                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1731                 
1732                 int j=4;        //need to get past the first four bases
1733                 while(qualities[i][j] != -1){
1734                     qualityFile << qualities[i][j] << ' ';
1735                     j++;
1736                 }
1737                 qualityFile << endl;
1738             }
1739         }
1740         qualityFile.close();
1741         outputNames.push_back(qualityFileName);
1742         
1743     }
1744     catch(exception& e) {
1745         m->errorOut(e, "ShhherCommand", "writeQualities");
1746         exit(1);        
1747     }           
1748 }
1749
1750 /**************************************************************************************************/
1751
1752 void ShhherCommand::writeSequences(vector<int> otuCounts){
1753     try {
1754         string thisOutputDir = outputDir;
1755         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1756         string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + getOutputFileNameTag("fasta");
1757         ofstream fastaFile;
1758         m->openOutputFile(fastaFileName, fastaFile);
1759         
1760         vector<string> names(numOTUs, "");
1761         
1762         for(int i=0;i<numOTUs;i++){
1763             
1764             if (m->control_pressed) { break; }
1765             
1766             int index = centroids[i];
1767             
1768             if(otuCounts[i] > 0){
1769                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1770                 
1771                 string newSeq = "";
1772                 
1773                 for(int j=0;j<numFlowCells;j++){
1774                     
1775                     char base = flowOrder[j % 4];
1776                     for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
1777                         newSeq += base;
1778                     }
1779                 }
1780                 
1781                 fastaFile << newSeq.substr(4) << endl;
1782             }
1783         }
1784         fastaFile.close();
1785         
1786         outputNames.push_back(fastaFileName);
1787         
1788         if(compositeFASTAFileName != ""){
1789             m->appendFiles(fastaFileName, compositeFASTAFileName);
1790         }
1791     }
1792     catch(exception& e) {
1793         m->errorOut(e, "ShhherCommand", "writeSequences");
1794         exit(1);        
1795     }           
1796 }
1797
1798 /**************************************************************************************************/
1799
1800 void ShhherCommand::writeNames(vector<int> otuCounts){
1801     try {
1802         string thisOutputDir = outputDir;
1803         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1804         string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + getOutputFileNameTag("name");
1805         ofstream nameFile;
1806         m->openOutputFile(nameFileName, nameFile);
1807         
1808         for(int i=0;i<numOTUs;i++){
1809             
1810             if (m->control_pressed) { break; }
1811             
1812             if(otuCounts[i] > 0){
1813                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
1814                 
1815                 for(int j=1;j<nSeqsPerOTU[i];j++){
1816                     nameFile << ',' << seqNameVector[aaI[i][j]];
1817                 }
1818                 
1819                 nameFile << endl;
1820             }
1821         }
1822         nameFile.close();
1823         outputNames.push_back(nameFileName);
1824         
1825         
1826         if(compositeNamesFileName != ""){
1827             m->appendFiles(nameFileName, compositeNamesFileName);
1828         }               
1829     }
1830     catch(exception& e) {
1831         m->errorOut(e, "ShhherCommand", "writeNames");
1832         exit(1);        
1833     }           
1834 }
1835
1836 /**************************************************************************************************/
1837
1838 void ShhherCommand::writeGroups(){
1839     try {
1840         string thisOutputDir = outputDir;
1841         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1842         string fileRoot = m->getRootName(m->getSimpleName(flowFileName));
1843         int pos = fileRoot.find_first_of('.');
1844         string fileGroup = fileRoot;
1845         if (pos != string::npos) {  fileGroup = fileRoot.substr(pos+1, (fileRoot.length()-1-(pos+1)));  }
1846         string groupFileName = thisOutputDir + fileRoot + getOutputFileNameTag("group");
1847         ofstream groupFile;
1848         m->openOutputFile(groupFileName, groupFile);
1849         
1850         for(int i=0;i<numSeqs;i++){
1851             if (m->control_pressed) { break; }
1852             groupFile << seqNameVector[i] << '\t' << fileGroup << endl;
1853         }
1854         groupFile.close();
1855         outputNames.push_back(groupFileName);
1856         
1857     }
1858     catch(exception& e) {
1859         m->errorOut(e, "ShhherCommand", "writeGroups");
1860         exit(1);        
1861     }           
1862 }
1863
1864 /**************************************************************************************************/
1865
1866 void ShhherCommand::writeClusters(vector<int> otuCounts){
1867     try {
1868         string thisOutputDir = outputDir;
1869         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1870         string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) +getOutputFileNameTag("counts");
1871         ofstream otuCountsFile;
1872         m->openOutputFile(otuCountsFileName, otuCountsFile);
1873         
1874         string bases = flowOrder;
1875         
1876         for(int i=0;i<numOTUs;i++){
1877             
1878             if (m->control_pressed) {
1879                 break;
1880             }
1881             //output the translated version of the centroid sequence for the otu
1882             if(otuCounts[i] > 0){
1883                 int index = centroids[i];
1884                 
1885                 otuCountsFile << "ideal\t";
1886                 for(int j=8;j<numFlowCells;j++){
1887                     char base = bases[j % 4];
1888                     for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
1889                         otuCountsFile << base;
1890                     }
1891                 }
1892                 otuCountsFile << endl;
1893                 
1894                 for(int j=0;j<nSeqsPerOTU[i];j++){
1895                     int sequence = aaI[i][j];
1896                     otuCountsFile << seqNameVector[sequence] << '\t';
1897                     
1898                     string newSeq = "";
1899                     
1900                     for(int k=0;k<lengths[sequence];k++){
1901                         char base = bases[k % 4];
1902                         int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
1903                         
1904                         for(int s=0;s<freq;s++){
1905                             newSeq += base;
1906                             //otuCountsFile << base;
1907                         }
1908                     }
1909                     otuCountsFile << newSeq.substr(4) << endl;
1910                 }
1911                 otuCountsFile << endl;
1912             }
1913         }
1914         otuCountsFile.close();
1915         outputNames.push_back(otuCountsFileName);
1916         
1917     }
1918     catch(exception& e) {
1919         m->errorOut(e, "ShhherCommand", "writeClusters");
1920         exit(1);        
1921     }           
1922 }
1923
1924 #else
1925 //**********************************************************************************************************************
1926
1927 int ShhherCommand::execute(){
1928         try {
1929                 if (abort == true) { return 0; }
1930                 
1931                 getSingleLookUp();      if (m->control_pressed) { return 0; }
1932                 getJointLookUp();       if (m->control_pressed) { return 0; }
1933                 
1934         int numFiles = flowFileVector.size();
1935                 
1936         if (numFiles < processors) { processors = numFiles; }
1937         
1938 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1939         if (processors == 1) { driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size()); }
1940         else { createProcesses(flowFileVector); } //each processor processes one file
1941 #else
1942         driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size());
1943 #endif
1944         
1945                 if(compositeFASTAFileName != ""){
1946                         outputNames.push_back(compositeFASTAFileName);
1947                         outputNames.push_back(compositeNamesFileName);
1948                 }
1949
1950                 m->mothurOutEndLine();
1951                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
1952                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
1953                 m->mothurOutEndLine();
1954                 
1955                 return 0;
1956         }
1957         catch(exception& e) {
1958                 m->errorOut(e, "ShhherCommand", "execute");
1959                 exit(1);
1960         }
1961 }
1962 #endif
1963 /**************************************************************************************************/
1964
1965 int ShhherCommand::createProcesses(vector<string> filenames){
1966     try {
1967         vector<int> processIDS;
1968                 int process = 1;
1969                 int num = 0;
1970                 
1971                 //sanity check
1972                 if (filenames.size() < processors) { processors = filenames.size(); }
1973                 
1974                 //divide the groups between the processors
1975                 vector<linePair> lines;
1976         vector<int> numFilesToComplete;
1977                 int numFilesPerProcessor = filenames.size() / processors;
1978                 for (int i = 0; i < processors; i++) {
1979                         int startIndex =  i * numFilesPerProcessor;
1980                         int endIndex = (i+1) * numFilesPerProcessor;
1981                         if(i == (processors - 1)){      endIndex = filenames.size();    }
1982                         lines.push_back(linePair(startIndex, endIndex));
1983             numFilesToComplete.push_back((endIndex-startIndex));
1984                 }
1985                 
1986         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)          
1987                 
1988                 //loop through and create all the processes you want
1989                 while (process != processors) {
1990                         int pid = fork();
1991                         
1992                         if (pid > 0) {
1993                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1994                                 process++;
1995                         }else if (pid == 0){
1996                                 num = driver(filenames, compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName  + toString(getpid()) + ".temp", lines[process].start, lines[process].end);
1997                 
1998                 //pass numSeqs to parent
1999                                 ofstream out;
2000                                 string tempFile = compositeFASTAFileName + toString(getpid()) + ".num.temp";
2001                                 m->openOutputFile(tempFile, out);
2002                                 out << num << endl;
2003                                 out.close();
2004                 
2005                                 exit(0);
2006                         }else { 
2007                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
2008                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
2009                                 exit(0);
2010                         }
2011                 }
2012                 
2013                 //do my part
2014                 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[0].start, lines[0].end);
2015                 
2016                 //force parent to wait until all the processes are done
2017                 for (int i=0;i<processIDS.size();i++) { 
2018                         int temp = processIDS[i];
2019                         wait(&temp);
2020                 }
2021         
2022         #else
2023         
2024         //////////////////////////////////////////////////////////////////////////////////////////////////////
2025         
2026         /////////////////////// NOT WORKING, ACCESS VIOLATION ON READ OF FLOWGRAMS IN THREAD /////////////////
2027         
2028         //////////////////////////////////////////////////////////////////////////////////////////////////////
2029                 //Windows version shared memory, so be careful when passing variables through the shhhFlowsData struct. 
2030                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
2031                 //////////////////////////////////////////////////////////////////////////////////////////////////////
2032                 
2033                 vector<shhhFlowsData*> pDataArray; 
2034                 DWORD   dwThreadIdArray[processors-1];
2035                 HANDLE  hThreadArray[processors-1]; 
2036                 
2037                 //Create processor worker threads.
2038                 for( int i=0; i<processors-1; i++ ){
2039                         // Allocate memory for thread data.
2040                         string extension = "";
2041                         if (i != 0) { extension = toString(i) + ".temp"; }
2042                         
2043             shhhFlowsData* tempFlow = new shhhFlowsData(filenames, (compositeFASTAFileName + extension), (compositeNamesFileName + extension), outputDir, flowOrder, jointLookUp, singleLookUp, m, lines[i].start, lines[i].end, cutoff, sigma, minDelta, maxIters, i);
2044                         pDataArray.push_back(tempFlow);
2045                         processIDS.push_back(i);
2046             
2047                         hThreadArray[i] = CreateThread(NULL, 0, ShhhFlowsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
2048                 }
2049                 
2050                 //using the main process as a worker saves time and memory
2051                 //do my part
2052                 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[processors-1].start, lines[processors-1].end);
2053                 
2054                 //Wait until all threads have terminated.
2055                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
2056                 
2057                 //Close all thread handles and free memory allocations.
2058                 for(int i=0; i < pDataArray.size(); i++){
2059                         for(int j=0; j < pDataArray[i]->outputNames.size(); j++){ outputNames.push_back(pDataArray[i]->outputNames[j]); }
2060                         CloseHandle(hThreadArray[i]);
2061                         delete pDataArray[i];
2062                 }
2063                 
2064         #endif
2065         
2066         for (int i=0;i<processIDS.size();i++) { 
2067             ifstream in;
2068                         string tempFile =  compositeFASTAFileName + toString(processIDS[i]) + ".num.temp";
2069                         m->openInputFile(tempFile, in);
2070                         if (!in.eof()) { 
2071                 int tempNum = 0; 
2072                 in >> tempNum; 
2073                 if (tempNum != numFilesToComplete[i+1]) {
2074                     m->mothurOut("[ERROR]: main process expected " + toString(processIDS[i]) + " to complete " + toString(numFilesToComplete[i+1]) + " files, and it only reported completing " + toString(tempNum) + ". This will cause file mismatches.  The flow files may be too large to process with multiple processors. \n");
2075                 }
2076             }
2077                         in.close(); m->mothurRemove(tempFile);
2078             
2079             if (compositeFASTAFileName != "") {
2080                 m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
2081                 m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
2082                 m->mothurRemove((compositeFASTAFileName + toString(processIDS[i]) + ".temp"));
2083                 m->mothurRemove((compositeNamesFileName + toString(processIDS[i]) + ".temp"));
2084             }
2085         }
2086         
2087         return 0;
2088         
2089     }
2090         catch(exception& e) {
2091                 m->errorOut(e, "ShhherCommand", "createProcesses");
2092                 exit(1);
2093         }
2094 }
2095 /**************************************************************************************************/
2096
2097 vector<string> ShhherCommand::parseFlowFiles(string filename){
2098     try {
2099         vector<string> files;
2100         int count = 0;
2101         
2102         ifstream in;
2103         m->openInputFile(filename, in);
2104         
2105         int thisNumFLows = 0;
2106         in >> thisNumFLows; m->gobble(in);
2107         
2108         while (!in.eof()) {
2109             if (m->control_pressed) { break; }
2110             
2111             ofstream out;
2112             string outputFileName = filename + toString(count) + ".temp";
2113             m->openOutputFile(outputFileName, out);
2114             out << thisNumFLows << endl;
2115             files.push_back(outputFileName);
2116             
2117             int numLinesWrote = 0;
2118             for (int i = 0; i < largeSize; i++) {
2119                 if (in.eof()) { break; }
2120                 string line = m->getline(in); m->gobble(in);
2121                 out << line << endl;
2122                 numLinesWrote++;
2123             }
2124             out.close();
2125             
2126             if (numLinesWrote == 0) {  m->mothurRemove(outputFileName); files.pop_back();  }
2127             count++;
2128         }
2129         in.close();
2130         
2131         if (m->control_pressed) { for (int i = 0; i < files.size(); i++) { m->mothurRemove(files[i]); }  files.clear(); }
2132         
2133         m->mothurOut("\nDivided " + filename + " into " + toString(files.size()) + " files.\n\n"); 
2134         
2135         return files;
2136     }
2137         catch(exception& e) {
2138                 m->errorOut(e, "ShhherCommand", "parseFlowFiles");
2139                 exit(1);
2140         }
2141 }
2142 /**************************************************************************************************/
2143
2144 int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName, int start, int end){
2145     try {
2146         
2147         int numCompleted = 0;
2148         
2149         for(int i=start;i<end;i++){
2150                         
2151                         if (m->control_pressed) { break; }
2152                         
2153             vector<string> theseFlowFileNames; theseFlowFileNames.push_back(filenames[i]);
2154             if (large) {  theseFlowFileNames = parseFlowFiles(filenames[i]);  }
2155             
2156             if (m->control_pressed) { break; }
2157             
2158             double begClock = clock();
2159             unsigned long long begTime;
2160             
2161             for (int g = 0; g < theseFlowFileNames.size(); g++) {
2162                 
2163                 string flowFileName = theseFlowFileNames[g];
2164                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
2165                 m->mothurOut("Reading flowgrams...\n");
2166                 
2167                 vector<string> seqNameVector;
2168                 vector<int> lengths;
2169                 vector<short> flowDataIntI;
2170                 vector<double> flowDataPrI;
2171                 map<string, int> nameMap;
2172                 vector<short> uniqueFlowgrams;
2173                 vector<int> uniqueCount;
2174                 vector<int> mapSeqToUnique;
2175                 vector<int> mapUniqueToSeq;
2176                 vector<int> uniqueLengths;
2177                 int numFlowCells;
2178                 
2179                 if (m->debug) { m->mothurOut("[DEBUG]: About to read flowgrams.\n"); }
2180                 int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
2181                 
2182                 if (m->control_pressed) { break; }
2183                 
2184                 m->mothurOut("Identifying unique flowgrams...\n");
2185                 int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
2186                 
2187                 if (m->control_pressed) { break; }
2188                 
2189                 m->mothurOut("Calculating distances between flowgrams...\n");
2190                 string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
2191                 begTime = time(NULL);
2192                
2193                 
2194                 flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI); 
2195                 
2196                 m->mothurOutEndLine();
2197                 m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
2198                 
2199                 
2200                 string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
2201                 createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
2202                 
2203                 if (m->control_pressed) { break; }
2204                 
2205                 m->mothurOut("\nClustering flowgrams...\n");
2206                 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
2207                 cluster(listFileName, distFileName, namesFileName);
2208                 
2209                 if (m->control_pressed) { break; }
2210                 
2211                 vector<int> otuData;
2212                 vector<int> cumNumSeqs;
2213                 vector<int> nSeqsPerOTU;
2214                 vector<vector<int> > aaP;       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2215                 vector<vector<int> > aaI;       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2216                 vector<int> seqNumber;          //tMaster->anP:         the sequence id number sorted by OTU
2217                 vector<int> seqIndex;           //tMaster->anI;         the index that corresponds to seqNumber
2218                 
2219                 
2220                 int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
2221                 
2222                 if (m->control_pressed) { break; }
2223                 
2224                 m->mothurRemove(distFileName);
2225                 m->mothurRemove(namesFileName);
2226                 m->mothurRemove(listFileName);
2227                 
2228                 vector<double> dist;            //adDist - distance of sequences to centroids
2229                 vector<short> change;           //did the centroid sequence change? 0 = no; 1 = yes
2230                 vector<int> centroids;          //the representative flowgram for each cluster m
2231                 vector<double> weight;
2232                 vector<double> singleTau;       //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2233                 vector<int> nSeqsBreaks;
2234                 vector<int> nOTUsBreaks;
2235                 
2236                 if (m->debug) { m->mothurOut("[DEBUG]: numSeqs = " + toString(numSeqs) + " numOTUS = " + toString(numOTUs) + " about to alloc a dist vector with size = " + toString((numSeqs * numOTUs)) + ".\n"); }
2237                 
2238                 dist.assign(numSeqs * numOTUs, 0);
2239                 change.assign(numOTUs, 1);
2240                 centroids.assign(numOTUs, -1);
2241                 weight.assign(numOTUs, 0);
2242                 singleTau.assign(numSeqs, 1.0);
2243                 
2244                 nSeqsBreaks.assign(2, 0);
2245                 nOTUsBreaks.assign(2, 0);
2246                 
2247                 nSeqsBreaks[0] = 0;
2248                 nSeqsBreaks[1] = numSeqs;
2249                 nOTUsBreaks[1] = numOTUs;
2250                 
2251                 if (m->debug) { m->mothurOut("[DEBUG]: done allocating memory, about to denoise.\n"); }
2252                 
2253                 if (m->control_pressed) { break; }
2254                 
2255                 double maxDelta = 0;
2256                 int iter = 0;
2257                 
2258                 begClock = clock();
2259                 begTime = time(NULL);
2260                 
2261                 m->mothurOut("\nDenoising flowgrams...\n");
2262                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
2263                 
2264                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
2265                     
2266                     if (m->control_pressed) { break; }
2267                     
2268                     double cycClock = clock();
2269                     unsigned long long cycTime = time(NULL);
2270                     fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2271                     
2272                     if (m->control_pressed) { break; }
2273                     
2274                     calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2275                     
2276                     if (m->control_pressed) { break; }
2277                     
2278                     maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);  
2279                     
2280                     if (m->control_pressed) { break; }
2281                     
2282                     double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight); 
2283                     
2284                     if (m->control_pressed) { break; }
2285                     
2286                     checkCentroids(numOTUs, centroids, weight);
2287                     
2288                     if (m->control_pressed) { break; }
2289                     
2290                     calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU,  dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
2291                     
2292                     if (m->control_pressed) { break; }
2293                     
2294                     iter++;
2295                     
2296                     m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
2297                     
2298                 }       
2299                 
2300                 if (m->control_pressed) { break; }
2301                 
2302                 m->mothurOut("\nFinalizing...\n");
2303                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2304                 
2305                 if (m->control_pressed) { break; }
2306                 
2307                 setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
2308                 
2309                 if (m->control_pressed) { break; }
2310                 
2311                 vector<int> otuCounts(numOTUs, 0);
2312                 for(int j=0;j<numSeqs;j++)      {       otuCounts[otuData[j]]++;        }
2313                 
2314                 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber); 
2315                 
2316                 if (m->control_pressed) { break; }
2317                 
2318                 if ((large) && (g == 0)) {  flowFileName = filenames[i]; theseFlowFileNames[0] = filenames[i]; }
2319                 string thisOutputDir = outputDir;
2320                 if (outputDir == "") {  thisOutputDir = m->hasPath(flowFileName);  }
2321                 string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + getOutputFileNameTag("qfile");
2322                 string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + getOutputFileNameTag("fasta");
2323                 string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + getOutputFileNameTag("name");
2324                 string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + getOutputFileNameTag("counts");
2325                 string fileRoot = m->getRootName(m->getSimpleName(flowFileName));
2326                 int pos = fileRoot.find_first_of('.');
2327                 string fileGroup = fileRoot;
2328                 if (pos != string::npos) {  fileGroup = fileRoot.substr(pos+1, (fileRoot.length()-1-(pos+1)));  }
2329                 string groupFileName = thisOutputDir + fileRoot + getOutputFileNameTag("group");
2330
2331                 
2332                 writeQualities(numOTUs, numFlowCells, qualityFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
2333                 writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, fastaFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
2334                 writeNames(thisCompositeNamesFileName, numOTUs, nameFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU);                              if (m->control_pressed) { break; }
2335                 writeClusters(otuCountsFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI);                  if (m->control_pressed) { break; }
2336                 writeGroups(groupFileName, fileGroup, numSeqs, seqNameVector);                                          if (m->control_pressed) { break; }
2337                 
2338                 if (large) {
2339                     if (g > 0) {
2340                         m->appendFiles(qualityFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + getOutputFileNameTag("qfile")));
2341                         m->mothurRemove(qualityFileName);
2342                         m->appendFiles(fastaFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + getOutputFileNameTag("fasta")));
2343                         m->mothurRemove(fastaFileName);
2344                         m->appendFiles(nameFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + getOutputFileNameTag("name")));
2345                         m->mothurRemove(nameFileName);
2346                         m->appendFiles(otuCountsFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + getOutputFileNameTag("counts")));
2347                         m->mothurRemove(otuCountsFileName);
2348                         m->appendFiles(groupFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + getOutputFileNameTag("group")));
2349                         m->mothurRemove(groupFileName);
2350                     }
2351                     m->mothurRemove(theseFlowFileNames[g]);
2352                 }
2353                         }
2354             
2355             numCompleted++;
2356                         m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
2357                 }
2358                 
2359         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
2360         
2361         return numCompleted;
2362         
2363     }catch(exception& e) {
2364             m->errorOut(e, "ShhherCommand", "driver");
2365             exit(1);
2366     }
2367 }
2368
2369 /**************************************************************************************************/
2370 int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVector, vector<int>& thisLengths, vector<short>& thisFlowDataIntI, map<string, int>& thisNameMap, int& numFlowCells){
2371         try{
2372        
2373                 ifstream flowFile;
2374        
2375                 m->openInputFile(filename, flowFile);
2376                 
2377                 string seqName;
2378                 int currentNumFlowCells;
2379                 float intensity;
2380         thisSeqNameVector.clear();
2381                 thisLengths.clear();
2382                 thisFlowDataIntI.clear();
2383                 thisNameMap.clear();
2384                 
2385                 flowFile >> numFlowCells;
2386         if (m->debug) { m->mothurOut("[DEBUG]: numFlowCells = " + toString(numFlowCells) + ".\n"); }
2387                 int index = 0;//pcluster
2388                 while(!flowFile.eof()){
2389                         
2390                         if (m->control_pressed) { break; }
2391                         
2392                         flowFile >> seqName >> currentNumFlowCells;
2393             
2394                         thisLengths.push_back(currentNumFlowCells);
2395            
2396                         thisSeqNameVector.push_back(seqName);
2397                         thisNameMap[seqName] = index++;//pcluster
2398             
2399             if (m->debug) { m->mothurOut("[DEBUG]: seqName = " + seqName + " length = " + toString(currentNumFlowCells) + " index = " + toString(index) + "\n"); }
2400             
2401                         for(int i=0;i<numFlowCells;i++){
2402                                 flowFile >> intensity;
2403                                 if(intensity > 9.99)    {       intensity = 9.99;       }
2404                                 int intI = int(100 * intensity + 0.0001);
2405                                 thisFlowDataIntI.push_back(intI);
2406                         }
2407                         m->gobble(flowFile);
2408                 }
2409                 flowFile.close();
2410                 
2411                 int numSeqs = thisSeqNameVector.size();         
2412                 
2413                 for(int i=0;i<numSeqs;i++){
2414                         
2415                         if (m->control_pressed) { break; }
2416                         
2417                         int iNumFlowCells = i * numFlowCells;
2418                         for(int j=thisLengths[i];j<numFlowCells;j++){
2419                                 thisFlowDataIntI[iNumFlowCells + j] = 0;
2420                         }
2421                 }
2422         
2423         return numSeqs;
2424                 
2425         }
2426         catch(exception& e) {
2427                 m->errorOut(e, "ShhherCommand", "getFlowData");
2428                 exit(1);
2429         }
2430 }
2431 /**************************************************************************************************/
2432
2433 int ShhherCommand::flowDistParentFork(int numFlowCells, string distFileName, int stopSeq, vector<int>& mapUniqueToSeq, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2434         try{            
2435         
2436                 ostringstream outStream;
2437                 outStream.setf(ios::fixed, ios::floatfield);
2438                 outStream.setf(ios::dec, ios::basefield);
2439                 outStream.setf(ios::showpoint);
2440                 outStream.precision(6);
2441                 
2442                 int begTime = time(NULL);
2443                 double begClock = clock();
2444         
2445                 for(int i=0;i<stopSeq;i++){
2446                         
2447                         if (m->control_pressed) { break; }
2448                         
2449                         for(int j=0;j<i;j++){
2450                                 float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2451                 
2452                                 if(flowDistance < 1e-6){
2453                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
2454                                 }
2455                                 else if(flowDistance <= cutoff){
2456                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
2457                                 }
2458                         }
2459                         if(i % 100 == 0){
2460                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
2461                                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2462                                 m->mothurOutEndLine();
2463                         }
2464                 }
2465                 
2466                 ofstream distFile(distFileName.c_str());
2467                 distFile << outStream.str();            
2468                 distFile.close();
2469                 
2470                 if (m->control_pressed) {}
2471                 else {
2472                         m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
2473                         m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2474                         m->mothurOutEndLine();
2475                 }
2476         
2477         return 0;
2478         }
2479         catch(exception& e) {
2480                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
2481                 exit(1);
2482         }
2483 }
2484 /**************************************************************************************************/
2485
2486 float ShhherCommand::calcPairwiseDist(int numFlowCells, int seqA, int seqB, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2487         try{
2488                 int minLength = lengths[mapSeqToUnique[seqA]];
2489                 if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
2490                 
2491                 int ANumFlowCells = seqA * numFlowCells;
2492                 int BNumFlowCells = seqB * numFlowCells;
2493                 
2494                 float dist = 0;
2495                 
2496                 for(int i=0;i<minLength;i++){
2497                         
2498                         if (m->control_pressed) { break; }
2499                         
2500                         int flowAIntI = flowDataIntI[ANumFlowCells + i];
2501                         float flowAPrI = flowDataPrI[ANumFlowCells + i];
2502                         
2503                         int flowBIntI = flowDataIntI[BNumFlowCells + i];
2504                         float flowBPrI = flowDataPrI[BNumFlowCells + i];
2505                         dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
2506                 }
2507                 
2508                 dist /= (float) minLength;
2509                 return dist;
2510         }
2511         catch(exception& e) {
2512                 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
2513                 exit(1);
2514         }
2515 }
2516
2517 /**************************************************************************************************/
2518
2519 int ShhherCommand::getUniques(int numSeqs, int numFlowCells, vector<short>& uniqueFlowgrams, vector<int>& uniqueCount, vector<int>& uniqueLengths, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2520         try{
2521                 int numUniques = 0;
2522                 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
2523                 uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
2524                 uniqueLengths.assign(numSeqs, 0);
2525                 mapSeqToUnique.assign(numSeqs, -1);
2526                 mapUniqueToSeq.assign(numSeqs, -1);
2527                 
2528                 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
2529                 
2530                 for(int i=0;i<numSeqs;i++){
2531                         
2532                         if (m->control_pressed) { break; }
2533                         
2534                         int index = 0;
2535                         
2536                         vector<short> current(numFlowCells);
2537                         for(int j=0;j<numFlowCells;j++){
2538                                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
2539                         }
2540             
2541                         for(int j=0;j<numUniques;j++){
2542                                 int offset = j * numFlowCells;
2543                                 bool toEnd = 1;
2544                                 
2545                                 int shorterLength;
2546                                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
2547                                 else                                                            {       shorterLength = uniqueLengths[j];       }
2548                 
2549                                 for(int k=0;k<shorterLength;k++){
2550                                         if(current[k] != uniqueFlowgrams[offset + k]){
2551                                                 toEnd = 0;
2552                                                 break;
2553                                         }
2554                                 }
2555                                 
2556                                 if(toEnd){
2557                                         mapSeqToUnique[i] = j;
2558                                         uniqueCount[j]++;
2559                                         index = j;
2560                                         if(lengths[i] > uniqueLengths[j])       {       uniqueLengths[j] = lengths[i];  }
2561                                         break;
2562                                 }
2563                                 index++;
2564                         }
2565                         
2566                         if(index == numUniques){
2567                                 uniqueLengths[numUniques] = lengths[i];
2568                                 uniqueCount[numUniques] = 1;
2569                                 mapSeqToUnique[i] = numUniques;//anMap
2570                                 mapUniqueToSeq[numUniques] = i;//anF
2571                                 
2572                                 for(int k=0;k<numFlowCells;k++){
2573                                         uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
2574                                         uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
2575                                 }
2576                                 
2577                                 numUniques++;
2578                         }
2579                 }
2580                 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
2581                 uniqueLengths.resize(numUniques);       
2582                 
2583                 flowDataPrI.resize(numSeqs * numFlowCells, 0);
2584                 for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
2585         
2586         return numUniques;
2587         }
2588         catch(exception& e) {
2589                 m->errorOut(e, "ShhherCommand", "getUniques");
2590                 exit(1);
2591         }
2592 }
2593 /**************************************************************************************************/
2594 int ShhherCommand::createNamesFile(int numSeqs, int numUniques, string filename, vector<string>& seqNameVector, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq){
2595         try{
2596                 
2597                 vector<string> duplicateNames(numUniques, "");
2598                 for(int i=0;i<numSeqs;i++){
2599                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
2600                 }
2601                 
2602                 ofstream nameFile;
2603                 m->openOutputFile(filename, nameFile);
2604                 
2605                 for(int i=0;i<numUniques;i++){
2606                         
2607                         if (m->control_pressed) { break; }
2608                         
2609             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2610                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2611                 }
2612                 
2613                 nameFile.close();
2614         
2615                 return 0;
2616         }
2617         catch(exception& e) {
2618                 m->errorOut(e, "ShhherCommand", "createNamesFile");
2619                 exit(1);
2620         }
2621 }
2622 //**********************************************************************************************************************
2623
2624 int ShhherCommand::cluster(string filename, string distFileName, string namesFileName){
2625         try {
2626                 
2627                 ReadMatrix* read = new ReadColumnMatrix(distFileName);  
2628                 read->setCutoff(cutoff);
2629                 
2630                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
2631                 clusterNameMap->readMap();
2632                 read->read(clusterNameMap);
2633         
2634                 ListVector* list = read->getListVector();
2635                 SparseDistanceMatrix* matrix = read->getDMatrix();
2636                 
2637                 delete read; 
2638                 delete clusterNameMap; 
2639         
2640                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
2641                 
2642                 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
2643                 string tag = cluster->getTag();
2644                 
2645                 double clusterCutoff = cutoff;
2646                 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
2647                         
2648                         if (m->control_pressed) { break; }
2649                         
2650                         cluster->update(clusterCutoff);
2651                 }
2652                 
2653                 list->setLabel(toString(cutoff));
2654                 
2655                 ofstream listFile;
2656                 m->openOutputFile(filename, listFile);
2657                 list->print(listFile);
2658                 listFile.close();
2659                 
2660                 delete matrix;  delete cluster; delete rabund; delete list;
2661         
2662                 return 0;
2663         }
2664         catch(exception& e) {
2665                 m->errorOut(e, "ShhherCommand", "cluster");
2666                 exit(1);        
2667         }               
2668 }
2669 /**************************************************************************************************/
2670
2671 int ShhherCommand::getOTUData(int numSeqs, string fileName,  vector<int>& otuData,
2672                                vector<int>& cumNumSeqs,
2673                                vector<int>& nSeqsPerOTU,
2674                                vector<vector<int> >& aaP,       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2675                                vector<vector<int> >& aaI,       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2676                                vector<int>& seqNumber,          //tMaster->anP:         the sequence id number sorted by OTU
2677                                vector<int>& seqIndex,
2678                                map<string, int>& nameMap){
2679         try {
2680         
2681                 ifstream listFile;
2682                 m->openInputFile(fileName, listFile);
2683                 string label;
2684         int numOTUs;
2685                 
2686                 listFile >> label >> numOTUs;
2687         
2688         if (m->debug) { m->mothurOut("[DEBUG]: Getting OTU Data...\n"); }
2689         
2690                 otuData.assign(numSeqs, 0);
2691                 cumNumSeqs.assign(numOTUs, 0);
2692                 nSeqsPerOTU.assign(numOTUs, 0);
2693                 aaP.clear();aaP.resize(numOTUs);
2694                 
2695                 seqNumber.clear();
2696                 aaI.clear();
2697                 seqIndex.clear();
2698                 
2699                 string singleOTU = "";
2700                 
2701                 for(int i=0;i<numOTUs;i++){
2702                         
2703                         if (m->control_pressed) { break; }
2704             if (m->debug) { m->mothurOut("[DEBUG]: processing OTU " + toString(i) + ".\n"); }
2705             
2706                         listFile >> singleOTU;
2707                         
2708                         istringstream otuString(singleOTU);
2709             
2710                         while(otuString){
2711                                 
2712                                 string seqName = "";
2713                                 
2714                                 for(int j=0;j<singleOTU.length();j++){
2715                                         char letter = otuString.get();
2716                                         
2717                                         if(letter != ','){
2718                                                 seqName += letter;
2719                                         }
2720                                         else{
2721                                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2722                                                 int index = nmIt->second;
2723                                                 
2724                                                 nameMap.erase(nmIt);
2725                                                 
2726                                                 otuData[index] = i;
2727                                                 nSeqsPerOTU[i]++;
2728                                                 aaP[i].push_back(index);
2729                                                 seqName = "";
2730                                         }
2731                                 }
2732                                 
2733                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2734                 
2735                                 int index = nmIt->second;
2736                                 nameMap.erase(nmIt);
2737                 
2738                                 otuData[index] = i;
2739                                 nSeqsPerOTU[i]++;
2740                                 aaP[i].push_back(index);        
2741                                 
2742                                 otuString.get();
2743                         }
2744                         
2745                         sort(aaP[i].begin(), aaP[i].end());
2746                         for(int j=0;j<nSeqsPerOTU[i];j++){
2747                                 seqNumber.push_back(aaP[i][j]);
2748                         }
2749                         for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
2750                                 aaP[i].push_back(0);
2751                         }
2752                         
2753                         
2754                 }
2755                 
2756                 for(int i=1;i<numOTUs;i++){
2757                         cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
2758                 }
2759                 aaI = aaP;
2760                 seqIndex = seqNumber;
2761                 
2762                 listFile.close();       
2763         
2764         return numOTUs;
2765                 
2766         }
2767         catch(exception& e) {
2768                 m->errorOut(e, "ShhherCommand", "getOTUData");
2769                 exit(1);        
2770         }               
2771 }
2772 /**************************************************************************************************/
2773
2774 int ShhherCommand::calcCentroidsDriver(int numOTUs, 
2775                                           vector<int>& cumNumSeqs,
2776                                           vector<int>& nSeqsPerOTU,
2777                                           vector<int>& seqIndex,
2778                                           vector<short>& change,                //did the centroid sequence change? 0 = no; 1 = yes
2779                                           vector<int>& centroids,               //the representative flowgram for each cluster m
2780                                           vector<double>& singleTau,    //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2781                                           vector<int>& mapSeqToUnique,
2782                                           vector<short>& uniqueFlowgrams,
2783                                           vector<short>& flowDataIntI,
2784                                           vector<int>& lengths,
2785                                           int numFlowCells,
2786                                           vector<int>& seqNumber){                          
2787         
2788         //this function gets the most likely homopolymer length at a flow position for a group of sequences
2789         //within an otu
2790         
2791         try{
2792                 
2793                 for(int i=0;i<numOTUs;i++){
2794                         
2795                         if (m->control_pressed) { break; }
2796                         
2797                         double count = 0;
2798                         int position = 0;
2799                         int minFlowGram = 100000000;
2800                         double minFlowValue = 1e8;
2801                         change[i] = 0; //FALSE
2802                         
2803                         for(int j=0;j<nSeqsPerOTU[i];j++){
2804                                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
2805                         }
2806             
2807                         if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
2808                                 vector<double> adF(nSeqsPerOTU[i]);
2809                                 vector<int> anL(nSeqsPerOTU[i]);
2810                                 
2811                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2812                                         int index = cumNumSeqs[i] + j;
2813                                         int nI = seqIndex[index];
2814                                         int nIU = mapSeqToUnique[nI];
2815                                         
2816                                         int k;
2817                                         for(k=0;k<position;k++){
2818                                                 if(nIU == anL[k]){
2819                                                         break;
2820                                                 }
2821                                         }
2822                                         if(k == position){
2823                                                 anL[position] = nIU;
2824                                                 adF[position] = 0.0000;
2825                                                 position++;
2826                                         }                                               
2827                                 }
2828                                 
2829                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2830                                         int index = cumNumSeqs[i] + j;
2831                                         int nI = seqIndex[index];
2832                                         
2833                                         double tauValue = singleTau[seqNumber[index]];
2834                                         
2835                                         for(int k=0;k<position;k++){
2836                                                 double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
2837                                                 adF[k] += dist * tauValue;
2838                                         }
2839                                 }
2840                                 
2841                                 for(int j=0;j<position;j++){
2842                                         if(adF[j] < minFlowValue){
2843                                                 minFlowGram = j;
2844                                                 minFlowValue = adF[j];
2845                                         }
2846                                 }
2847                                 
2848                                 if(centroids[i] != anL[minFlowGram]){
2849                                         change[i] = 1;
2850                                         centroids[i] = anL[minFlowGram];
2851                                 }
2852                         }
2853                         else if(centroids[i] != -1){
2854                                 change[i] = 1;
2855                                 centroids[i] = -1;                      
2856                         }
2857                 }
2858         
2859         return 0;
2860         }
2861         catch(exception& e) {
2862                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
2863                 exit(1);        
2864         }               
2865 }
2866 /**************************************************************************************************/
2867
2868 double ShhherCommand::getDistToCentroid(int cent, int flow, int length, vector<short>& uniqueFlowgrams,
2869                                         vector<short>& flowDataIntI, int numFlowCells){
2870         try{
2871                 
2872                 int flowAValue = cent * numFlowCells;
2873                 int flowBValue = flow * numFlowCells;
2874                 
2875                 double dist = 0;
2876         
2877                 for(int i=0;i<length;i++){
2878                         dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
2879                         flowAValue++;
2880                         flowBValue++;
2881                 }
2882                 
2883                 return dist / (double)length;
2884         }
2885         catch(exception& e) {
2886                 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
2887                 exit(1);        
2888         }               
2889 }
2890 /**************************************************************************************************/
2891
2892 double ShhherCommand::getNewWeights(int numOTUs, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<double>& singleTau, vector<int>& seqNumber, vector<double>& weight){
2893         try{
2894                 
2895                 double maxChange = 0;
2896                 
2897                 for(int i=0;i<numOTUs;i++){
2898                         
2899                         if (m->control_pressed) { break; }
2900                         
2901                         double difference = weight[i];
2902                         weight[i] = 0;
2903                         
2904                         for(int j=0;j<nSeqsPerOTU[i];j++){
2905                                 int index = cumNumSeqs[i] + j;
2906                                 double tauValue = singleTau[seqNumber[index]];
2907                                 weight[i] += tauValue;
2908                         }
2909                         
2910                         difference = fabs(weight[i] - difference);
2911                         if(difference > maxChange){     maxChange = difference; }
2912                 }
2913                 return maxChange;
2914         }
2915         catch(exception& e) {
2916                 m->errorOut(e, "ShhherCommand", "getNewWeights");
2917                 exit(1);        
2918         }               
2919 }
2920
2921 /**************************************************************************************************/
2922
2923 double ShhherCommand::getLikelihood(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<int>& seqNumber, vector<int>& cumNumSeqs, vector<int>& seqIndex, vector<double>& dist, vector<double>& weight){
2924         
2925         try{
2926                 
2927                 vector<long double> P(numSeqs, 0);
2928                 int effNumOTUs = 0;
2929                 
2930                 for(int i=0;i<numOTUs;i++){
2931                         if(weight[i] > MIN_WEIGHT){
2932                                 effNumOTUs++;
2933                         }
2934                 }
2935                 
2936                 string hold;
2937                 for(int i=0;i<numOTUs;i++){
2938                         
2939                         if (m->control_pressed) { break; }
2940                         
2941                         for(int j=0;j<nSeqsPerOTU[i];j++){
2942                                 int index = cumNumSeqs[i] + j;
2943                                 int nI = seqIndex[index];
2944                                 double singleDist = dist[seqNumber[index]];
2945                                 
2946                                 P[nI] += weight[i] * exp(-singleDist * sigma);
2947                         }
2948                 }
2949                 double nLL = 0.00;
2950                 for(int i=0;i<numSeqs;i++){
2951                         if(P[i] == 0){  P[i] = DBL_EPSILON;     }
2952             
2953                         nLL += -log(P[i]);
2954                 }
2955                 
2956                 nLL = nLL -(double)numSeqs * log(sigma);
2957         
2958                 return nLL; 
2959         }
2960         catch(exception& e) {
2961                 m->errorOut(e, "ShhherCommand", "getNewWeights");
2962                 exit(1);        
2963         }               
2964 }
2965
2966 /**************************************************************************************************/
2967
2968 int ShhherCommand::checkCentroids(int numOTUs, vector<int>& centroids, vector<double>& weight){
2969         try{
2970                 vector<int> unique(numOTUs, 1);
2971                 
2972                 for(int i=0;i<numOTUs;i++){
2973                         if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
2974                                 unique[i] = -1;
2975                         }
2976                 }
2977                 
2978                 for(int i=0;i<numOTUs;i++){
2979                         
2980                         if (m->control_pressed) { break; }
2981                         
2982                         if(unique[i] == 1){
2983                                 for(int j=i+1;j<numOTUs;j++){
2984                                         if(unique[j] == 1){
2985                                                 
2986                                                 if(centroids[j] == centroids[i]){
2987                                                         unique[j] = 0;
2988                                                         centroids[j] = -1;
2989                                                         
2990                                                         weight[i] += weight[j];
2991                                                         weight[j] = 0.0;
2992                                                 }
2993                                         }
2994                                 }
2995                         }
2996                 }
2997         
2998         return 0;
2999         }
3000         catch(exception& e) {
3001                 m->errorOut(e, "ShhherCommand", "checkCentroids");
3002                 exit(1);        
3003         }               
3004 }
3005 /**************************************************************************************************/
3006
3007 void ShhherCommand::calcNewDistances(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<double>& dist, 
3008                                      vector<double>& weight, vector<short>& change, vector<int>& centroids,
3009                                      vector<vector<int> >& aaP, vector<double>& singleTau, vector<vector<int> >& aaI,   
3010                                      vector<int>& seqNumber, vector<int>& seqIndex,
3011                                      vector<short>& uniqueFlowgrams,
3012                                      vector<short>& flowDataIntI, int numFlowCells, vector<int>& lengths){
3013         
3014         try{
3015                 
3016                 int total = 0;
3017                 vector<double> newTau(numOTUs,0);
3018                 vector<double> norms(numSeqs, 0);
3019                 nSeqsPerOTU.assign(numOTUs, 0);
3020         
3021                 for(int i=0;i<numSeqs;i++){
3022                         
3023                         if (m->control_pressed) { break; }
3024                         
3025                         int indexOffset = i * numOTUs;
3026             
3027                         double offset = 1e8;
3028                         
3029                         for(int j=0;j<numOTUs;j++){
3030                 
3031                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
3032                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
3033                                 }
3034                 
3035                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
3036                                         offset = dist[indexOffset + j];
3037                                 }
3038                         }
3039             
3040                         for(int j=0;j<numOTUs;j++){
3041                                 if(weight[j] > MIN_WEIGHT){
3042                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
3043                                         norms[i] += newTau[j];
3044                                 }
3045                                 else{
3046                                         newTau[j] = 0.0;
3047                                 }
3048                         }
3049             
3050                         for(int j=0;j<numOTUs;j++){
3051                                 newTau[j] /= norms[i];
3052                         }
3053             
3054                         for(int j=0;j<numOTUs;j++){
3055                                 if(newTau[j] > MIN_TAU){
3056                                         
3057                                         int oldTotal = total;
3058                                         
3059                                         total++;
3060                                         
3061                                         singleTau.resize(total, 0);
3062                                         seqNumber.resize(total, 0);
3063                                         seqIndex.resize(total, 0);
3064                                         
3065                                         singleTau[oldTotal] = newTau[j];
3066                                         
3067                                         aaP[j][nSeqsPerOTU[j]] = oldTotal;
3068                                         aaI[j][nSeqsPerOTU[j]] = i;
3069                                         nSeqsPerOTU[j]++;
3070                                 }
3071                         }
3072             
3073                 }
3074         
3075         }
3076         catch(exception& e) {
3077                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
3078                 exit(1);        
3079         }               
3080 }
3081 /**************************************************************************************************/
3082
3083 int ShhherCommand::fill(int numOTUs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
3084         try {
3085                 int index = 0;
3086                 for(int i=0;i<numOTUs;i++){
3087                         
3088                         if (m->control_pressed) { return 0; }
3089                         
3090                         cumNumSeqs[i] = index;
3091                         for(int j=0;j<nSeqsPerOTU[i];j++){
3092                                 seqNumber[index] = aaP[i][j];
3093                                 seqIndex[index] = aaI[i][j];
3094                                 
3095                                 index++;
3096                         }
3097                 }
3098         
3099         return 0; 
3100         }
3101         catch(exception& e) {
3102                 m->errorOut(e, "ShhherCommand", "fill");
3103                 exit(1);        
3104         }               
3105 }
3106 /**************************************************************************************************/
3107
3108 void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU,
3109                             vector<int>& otuData, vector<double>& singleTau, vector<double>& dist, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
3110         
3111         try {
3112                 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
3113                 
3114                 for(int i=0;i<numOTUs;i++){
3115                         
3116                         if (m->control_pressed) { break; }
3117                         
3118                         for(int j=0;j<nSeqsPerOTU[i];j++){
3119                                 int index = cumNumSeqs[i] + j;
3120                                 double tauValue = singleTau[seqNumber[index]];
3121                                 int sIndex = seqIndex[index];
3122                                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
3123                         }
3124                 }
3125                 
3126                 for(int i=0;i<numSeqs;i++){
3127                         double maxTau = -1.0000;
3128                         int maxOTU = -1;
3129                         for(int j=0;j<numOTUs;j++){
3130                                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
3131                                         maxTau = bigTauMatrix[i * numOTUs + j];
3132                                         maxOTU = j;
3133                                 }
3134                         }
3135                         
3136                         otuData[i] = maxOTU;
3137                 }
3138                 
3139                 nSeqsPerOTU.assign(numOTUs, 0);         
3140                 
3141                 for(int i=0;i<numSeqs;i++){
3142                         int index = otuData[i];
3143                         
3144                         singleTau[i] = 1.0000;
3145                         dist[i] = 0.0000;
3146                         
3147                         aaP[index][nSeqsPerOTU[index]] = i;
3148                         aaI[index][nSeqsPerOTU[index]] = i;
3149                         
3150                         nSeqsPerOTU[index]++;
3151                 }
3152         
3153                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);  
3154         }
3155         catch(exception& e) {
3156                 m->errorOut(e, "ShhherCommand", "setOTUs");
3157                 exit(1);        
3158         }               
3159 }
3160 /**************************************************************************************************/
3161
3162 void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string qualityFileName, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
3163                                    vector<double>& singleTau, vector<short>& flowDataIntI, vector<short>& uniqueFlowgrams, vector<int>& cumNumSeqs,
3164                                    vector<int>& mapUniqueToSeq, vector<string>& seqNameVector, vector<int>& centroids, vector<vector<int> >& aaI){
3165         
3166         try {
3167         
3168                 ofstream qualityFile;
3169                 m->openOutputFile(qualityFileName, qualityFile);
3170         
3171                 qualityFile.setf(ios::fixed, ios::floatfield);
3172                 qualityFile.setf(ios::showpoint);
3173                 qualityFile << setprecision(6);
3174                 
3175                 vector<vector<int> > qualities(numOTUs);
3176                 vector<double> pr(HOMOPS, 0);
3177                 
3178                 
3179                 for(int i=0;i<numOTUs;i++){
3180                         
3181                         if (m->control_pressed) { break; }
3182                         
3183                         int index = 0;
3184                         int base = 0;
3185                         
3186                         if(nSeqsPerOTU[i] > 0){
3187                                 qualities[i].assign(1024, -1);
3188                                 
3189                                 while(index < numFlowCells){
3190                                         double maxPrValue = 1e8;
3191                                         short maxPrIndex = -1;
3192                                         double count = 0.0000;
3193                                         
3194                                         pr.assign(HOMOPS, 0);
3195                                         
3196                                         for(int j=0;j<nSeqsPerOTU[i];j++){
3197                                                 int lIndex = cumNumSeqs[i] + j;
3198                                                 double tauValue = singleTau[seqNumber[lIndex]];
3199                                                 int sequenceIndex = aaI[i][j];
3200                                                 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
3201                                                 
3202                                                 count += tauValue;
3203                                                 
3204                                                 for(int s=0;s<HOMOPS;s++){
3205                                                         pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
3206                                                 }
3207                                         }
3208                                         
3209                                         maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
3210                                         maxPrValue = pr[maxPrIndex];
3211                                         
3212                                         if(count > MIN_COUNT){
3213                                                 double U = 0.0000;
3214                                                 double norm = 0.0000;
3215                                                 
3216                                                 for(int s=0;s<HOMOPS;s++){
3217                                                         norm += exp(-(pr[s] - maxPrValue));
3218                                                 }
3219                                                 
3220                                                 for(int s=1;s<=maxPrIndex;s++){
3221                                                         int value = 0;
3222                                                         double temp = 0.0000;
3223                                                         
3224                                                         U += exp(-(pr[s-1]-maxPrValue))/norm;
3225                                                         
3226                                                         if(U>0.00){
3227                                                                 temp = log10(U);
3228                                                         }
3229                                                         else{
3230                                                                 temp = -10.1;
3231                                                         }
3232                                                         temp = floor(-10 * temp);
3233                                                         value = (int)floor(temp);
3234                                                         if(value > 100){        value = 100;    }
3235                                                         
3236                                                         qualities[i][base] = (int)value;
3237                                                         base++;
3238                                                 }
3239                                         }
3240                                         
3241                                         index++;
3242                                 }
3243                         }
3244                         
3245                         
3246                         if(otuCounts[i] > 0){
3247                                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
3248                         
3249                                 int j=4;        //need to get past the first four bases
3250                                 while(qualities[i][j] != -1){
3251                     qualityFile << qualities[i][j] << ' ';
3252                     if (j > qualities[i].size()) { break; }
3253                     j++;
3254                                 }
3255                                 qualityFile << endl;
3256                         }
3257                 }
3258                 qualityFile.close();
3259                 outputNames.push_back(qualityFileName);
3260         
3261         }
3262         catch(exception& e) {
3263                 m->errorOut(e, "ShhherCommand", "writeQualities");
3264                 exit(1);        
3265         }               
3266 }
3267
3268 /**************************************************************************************************/
3269
3270 void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string fastaFileName, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
3271         try {
3272                 
3273                 ofstream fastaFile;
3274                 m->openOutputFile(fastaFileName, fastaFile);
3275                 
3276                 vector<string> names(numOTUs, "");
3277                 
3278                 for(int i=0;i<numOTUs;i++){
3279                         
3280                         if (m->control_pressed) { break; }
3281                         
3282                         int index = centroids[i];
3283                         
3284                         if(otuCounts[i] > 0){
3285                                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
3286                                 
3287                                 string newSeq = "";
3288                                 
3289                                 for(int j=0;j<numFlowCells;j++){
3290                                         
3291                                         char base = flowOrder[j % 4];
3292                                         for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
3293                                                 newSeq += base;
3294                                         }
3295                                 }
3296                                 
3297                                 if (newSeq.length() >= 4) {  fastaFile << newSeq.substr(4) << endl;  }
3298                 else {  fastaFile << "NNNN" << endl;  }
3299                         }
3300                 }
3301                 fastaFile.close();
3302         
3303                 outputNames.push_back(fastaFileName);
3304         
3305                 if(thisCompositeFASTAFileName != ""){
3306                         m->appendFiles(fastaFileName, thisCompositeFASTAFileName);
3307                 }
3308         }
3309         catch(exception& e) {
3310                 m->errorOut(e, "ShhherCommand", "writeSequences");
3311                 exit(1);        
3312         }               
3313 }
3314
3315 /**************************************************************************************************/
3316
3317 void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string nameFileName, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
3318         try {
3319                 
3320                 ofstream nameFile;
3321                 m->openOutputFile(nameFileName, nameFile);
3322                 
3323                 for(int i=0;i<numOTUs;i++){
3324                         
3325                         if (m->control_pressed) { break; }
3326                         
3327                         if(otuCounts[i] > 0){
3328                                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
3329                                 
3330                                 for(int j=1;j<nSeqsPerOTU[i];j++){
3331                                         nameFile << ',' << seqNameVector[aaI[i][j]];
3332                                 }
3333                                 
3334                                 nameFile << endl;
3335                         }
3336                 }
3337                 nameFile.close();
3338                 outputNames.push_back(nameFileName);
3339                 
3340                 
3341                 if(thisCompositeNamesFileName != ""){
3342                         m->appendFiles(nameFileName, thisCompositeNamesFileName);
3343                 }               
3344         }
3345         catch(exception& e) {
3346                 m->errorOut(e, "ShhherCommand", "writeNames");
3347                 exit(1);        
3348         }               
3349 }
3350
3351 /**************************************************************************************************/
3352
3353 void ShhherCommand::writeGroups(string groupFileName, string fileRoot, int numSeqs, vector<string>& seqNameVector){
3354         try {
3355         ofstream groupFile;
3356                 m->openOutputFile(groupFileName, groupFile);
3357                 
3358                 for(int i=0;i<numSeqs;i++){
3359                         if (m->control_pressed) { break; }
3360                         groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
3361                 }
3362                 groupFile.close();
3363                 outputNames.push_back(groupFileName);
3364         
3365         }
3366         catch(exception& e) {
3367                 m->errorOut(e, "ShhherCommand", "writeGroups");
3368                 exit(1);        
3369         }               
3370 }
3371
3372 /**************************************************************************************************/
3373
3374 void ShhherCommand::writeClusters(string otuCountsFileName, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
3375         try {
3376                 ofstream otuCountsFile;
3377                 m->openOutputFile(otuCountsFileName, otuCountsFile);
3378                 
3379                 string bases = flowOrder;
3380                 
3381                 for(int i=0;i<numOTUs;i++){
3382                         
3383                         if (m->control_pressed) {
3384                                 break;
3385                         }
3386                         //output the translated version of the centroid sequence for the otu
3387                         if(otuCounts[i] > 0){
3388                                 int index = centroids[i];
3389                                 
3390                                 otuCountsFile << "ideal\t";
3391                                 for(int j=8;j<numFlowCells;j++){
3392                                         char base = bases[j % 4];
3393                                         for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
3394                                                 otuCountsFile << base;
3395                                         }
3396                                 }
3397                                 otuCountsFile << endl;
3398                                 
3399                                 for(int j=0;j<nSeqsPerOTU[i];j++){
3400                                         int sequence = aaI[i][j];
3401                                         otuCountsFile << seqNameVector[sequence] << '\t';
3402                                         
3403                                         string newSeq = "";
3404                                         
3405                                         for(int k=0;k<lengths[sequence];k++){
3406                                                 char base = bases[k % 4];
3407                                                 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
3408                         
3409                                                 for(int s=0;s<freq;s++){
3410                                                         newSeq += base;
3411                                                         //otuCountsFile << base;
3412                                                 }
3413                                         }
3414                                         
3415                     if (newSeq.length() >= 4) {  otuCountsFile << newSeq.substr(4) << endl;  }
3416                     else {  otuCountsFile << "NNNN" << endl;  }
3417                                 }
3418                                 otuCountsFile << endl;
3419                         }
3420                 }
3421                 otuCountsFile.close();
3422                 outputNames.push_back(otuCountsFileName);
3423         
3424         }
3425         catch(exception& e) {
3426                 m->errorOut(e, "ShhherCommand", "writeClusters");
3427                 exit(1);        
3428         }               
3429 }
3430
3431 /**************************************************************************************************/
3432
3433 void ShhherCommand::getSingleLookUp(){
3434         try{
3435                 //      these are the -log probabilities that a signal corresponds to a particular homopolymer length
3436                 singleLookUp.assign(HOMOPS * NUMBINS, 0);
3437                 
3438                 int index = 0;
3439                 ifstream lookUpFile;
3440                 m->openInputFile(lookupFileName, lookUpFile);
3441                 
3442                 for(int i=0;i<HOMOPS;i++){
3443                         
3444                         if (m->control_pressed) { break; }
3445                         
3446                         float logFracFreq;
3447                         lookUpFile >> logFracFreq;
3448                         
3449                         for(int j=0;j<NUMBINS;j++)      {
3450                                 lookUpFile >> singleLookUp[index];
3451                                 index++;
3452                         }
3453                 }       
3454                 lookUpFile.close();
3455         }
3456         catch(exception& e) {
3457                 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
3458                 exit(1);
3459         }
3460 }
3461
3462 /**************************************************************************************************/
3463
3464 void ShhherCommand::getJointLookUp(){
3465         try{
3466                 
3467                 //      the most likely joint probability (-log) that two intenities have the same polymer length
3468                 jointLookUp.resize(NUMBINS * NUMBINS, 0);
3469                 
3470                 for(int i=0;i<NUMBINS;i++){
3471                         
3472                         if (m->control_pressed) { break; }
3473                         
3474                         for(int j=0;j<NUMBINS;j++){             
3475                                 
3476                                 double minSum = 100000000;
3477                                 
3478                                 for(int k=0;k<HOMOPS;k++){
3479                                         double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
3480                                         
3481                                         if(sum < minSum)        {       minSum = sum;           }
3482                                 }       
3483                                 jointLookUp[i * NUMBINS + j] = minSum;
3484                         }
3485                 }
3486         }
3487         catch(exception& e) {
3488                 m->errorOut(e, "ShhherCommand", "getJointLookUp");
3489                 exit(1);
3490         }
3491 }
3492
3493 /**************************************************************************************************/
3494
3495 double ShhherCommand::getProbIntensity(int intIntensity){                          
3496         try{
3497
3498                 double minNegLogProb = 100000000; 
3499
3500                 
3501                 for(int i=0;i<HOMOPS;i++){//loop signal strength
3502                         
3503                         if (m->control_pressed) { break; }
3504                         
3505                         float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
3506                         if(negLogProb < minNegLogProb)  {       minNegLogProb = negLogProb; }
3507                 }
3508                 
3509                 return minNegLogProb;
3510         }
3511         catch(exception& e) {
3512                 m->errorOut(e, "ShhherCommand", "getProbIntensity");
3513                 exit(1);
3514         }
3515 }
3516
3517
3518
3519