]> git.donarmstrong.com Git - mothur.git/blob - shhhercommand.cpp
added load.logfile command. changed summary.single output for subsample=t.
[mothur.git] / shhhercommand.cpp
1 /*
2  *  shhher.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/27/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "shhhercommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> ShhherCommand::setParameters(){  
14         try {
15                 CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pflow);
16                 CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pfile);
17                 CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(plookup);
18                 CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(pcutoff);
19                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
20                 CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "",false,false); parameters.push_back(pmaxiter);
21         CommandParameter plarge("large", "Number", "", "-1", "", "", "",false,false); parameters.push_back(plarge);
22                 CommandParameter psigma("sigma", "Number", "", "60", "", "", "",false,false); parameters.push_back(psigma);
23                 CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "",false,false); parameters.push_back(pmindelta);
24                 CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder);
25                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
26                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
27                 
28                 vector<string> myArray;
29                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
30                 return myArray;
31         }
32         catch(exception& e) {
33                 m->errorOut(e, "ShhherCommand", "setParameters");
34                 exit(1);
35         }
36 }
37 //**********************************************************************************************************************
38 string ShhherCommand::getHelpString(){  
39         try {
40                 string helpString = "";
41                 helpString += "The shhh.flows command reads a file containing flowgrams and creates a file of corrected sequences.\n";
42                 return helpString;
43         }
44         catch(exception& e) {
45                 m->errorOut(e, "ShhherCommand", "getHelpString");
46                 exit(1);
47         }
48 }
49 //**********************************************************************************************************************
50 string ShhherCommand::getOutputFileNameTag(string type, string inputName=""){   
51         try {
52         string outputFileName = "";
53                 map<string, vector<string> >::iterator it;
54         
55         //is this a type this command creates
56         it = outputTypes.find(type);
57         if (it == outputTypes.end()) {  m->mothurOut("[ERROR]: this command doesn't create a " + type + " output file.\n"); }
58         else {
59             if (type == "fasta")            {   outputFileName =  "shhh.fasta";   }
60             else if (type == "name")    {   outputFileName =  "shhh.names";   }
61             else if (type == "group")        {   outputFileName =  "shhh.groups";   }
62             else if (type == "counts")        {   outputFileName =  "shhh.counts";   }
63             else if (type == "qfile")        {   outputFileName =  "shhh.qual";   }
64             else { m->mothurOut("[ERROR]: No definition for type " + type + " output file tag.\n"); m->control_pressed = true;  }
65         }
66         return outputFileName;
67         }
68         catch(exception& e) {
69                 m->errorOut(e, "ShhherCommand", "getOutputFileNameTag");
70                 exit(1);
71         }
72 }
73 //**********************************************************************************************************************
74
75 ShhherCommand::ShhherCommand(){ 
76         try {
77                 abort = true; calledHelp = true;
78                 setParameters();
79                 
80                 //initialize outputTypes
81                 vector<string> tempOutNames;
82                 outputTypes["fasta"] = tempOutNames;
83         outputTypes["name"] = tempOutNames;
84         outputTypes["group"] = tempOutNames;
85         outputTypes["counts"] = tempOutNames;
86         outputTypes["qfile"] = tempOutNames;
87
88         }
89         catch(exception& e) {
90                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
91                 exit(1);
92         }
93 }
94
95 //**********************************************************************************************************************
96
97 ShhherCommand::ShhherCommand(string option) {
98         try {
99         
100 #ifdef USE_MPI
101                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
102                 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
103         
104                 if(pid == 0){
105 #endif
106                 abort = false; calledHelp = false;   
107                 
108                 //allow user to run help
109                 if(option == "help") { help(); abort = true; calledHelp = true; }
110                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
111                 
112                 else {
113                         vector<string> myArray = setParameters();
114                         
115                         OptionParser parser(option);
116                         map<string,string> parameters = parser.getParameters();
117                         
118                         ValidParameters validParameter;
119                         map<string,string>::iterator it;
120                         
121                         //check to make sure all parameters are valid for command
122                         for (it = parameters.begin(); it != parameters.end(); it++) { 
123                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
124                         }
125                         
126                         //initialize outputTypes
127             vector<string> tempOutNames;
128             outputTypes["fasta"] = tempOutNames;
129             outputTypes["name"] = tempOutNames;
130             outputTypes["group"] = tempOutNames;
131             outputTypes["counts"] = tempOutNames;
132             outputTypes["qfile"] = tempOutNames;
133
134                         
135                         //if the user changes the input directory command factory will send this info to us in the output parameter 
136                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
137                         if (inputDir == "not found"){   inputDir = "";          }
138                         else {
139                                 string path;
140                                 it = parameters.find("flow");
141                                 //user has given a template file
142                                 if(it != parameters.end()){ 
143                                         path = m->hasPath(it->second);
144                                         //if the user has not given a path then, add inputdir. else leave path alone.
145                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
146                                 }
147                                 
148                                 it = parameters.find("lookup");
149                                 //user has given a template file
150                                 if(it != parameters.end()){ 
151                                         path = m->hasPath(it->second);
152                                         //if the user has not given a path then, add inputdir. else leave path alone.
153                                         if (path == "") {       parameters["lookup"] = inputDir + it->second;           }
154                                 }
155
156                                 it = parameters.find("file");
157                                 //user has given a template file
158                                 if(it != parameters.end()){ 
159                                         path = m->hasPath(it->second);
160                                         //if the user has not given a path then, add inputdir. else leave path alone.
161                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
162                                 }
163                         }
164             
165             //if the user changes the output directory command factory will send this info to us in the output parameter 
166                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
167             
168                         //check for required parameters
169                         flowFileName = validParameter.validFile(parameters, "flow", true);
170                         flowFilesFileName = validParameter.validFile(parameters, "file", true);
171                         if (flowFileName == "not found" && flowFilesFileName == "not found") {
172                                 m->mothurOut("values for either flow or file must be provided for the shhh.flows command.");
173                                 m->mothurOutEndLine();
174                                 abort = true; 
175                         }
176                         else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
177                         
178                         if(flowFileName != "not found"){
179                                 compositeFASTAFileName = "";    
180                                 compositeNamesFileName = "";    
181                         }
182                         else{
183                                 ofstream temp;
184                 
185                 string thisoutputDir = outputDir;
186                 if (outputDir == "") {  thisoutputDir =  m->hasPath(flowFilesFileName); } //if user entered a file with a path then preserve it
187                 
188                                 //we want to rip off .files, and also .flow if its there
189                 string fileroot = m->getRootName(m->getSimpleName(flowFilesFileName));
190                 if (fileroot[fileroot.length()-1] == '.') {  fileroot = fileroot.substr(0, fileroot.length()-1); } //rip off dot
191                 string extension = m->getExtension(fileroot);
192                 if (extension == ".flow") { fileroot = m->getRootName(fileroot);  }
193                 else { fileroot += "."; } //add back if needed
194                 
195                                 compositeFASTAFileName = thisoutputDir + fileroot + "shhh.fasta";
196                                 m->openOutputFile(compositeFASTAFileName, temp);
197                                 temp.close();
198                                 
199                                 compositeNamesFileName = thisoutputDir + fileroot + "shhh.names";
200                                 m->openOutputFile(compositeNamesFileName, temp);
201                                 temp.close();
202                         }
203             
204             if(flowFilesFileName != "not found"){
205                 string fName;
206                 
207                 ifstream flowFilesFile;
208                 m->openInputFile(flowFilesFileName, flowFilesFile);
209                 while(flowFilesFile){
210                     fName = m->getline(flowFilesFile);
211                     
212                     //test if file is valid
213                     ifstream in;
214                     int ableToOpen = m->openInputFile(fName, in, "noerror");
215                     in.close(); 
216                     if (ableToOpen == 1) {
217                         if (inputDir != "") { //default path is set
218                             string tryPath = inputDir + fName;
219                             m->mothurOut("Unable to open " + fName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
220                             ifstream in2;
221                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
222                             in2.close();
223                             fName = tryPath;
224                         }
225                     }
226                     
227                     if (ableToOpen == 1) {
228                         if (m->getDefaultPath() != "") { //default path is set
229                             string tryPath = m->getDefaultPath() + m->getSimpleName(fName);
230                             m->mothurOut("Unable to open " + fName + ". Trying default " + tryPath); m->mothurOutEndLine();
231                             ifstream in2;
232                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
233                             in2.close();
234                             fName = tryPath;
235                         }
236                     }
237                     
238                     //if you can't open it its not in current working directory or inputDir, try mothur excutable location
239                     if (ableToOpen == 1) {
240                         string exepath = m->argv;
241                         string tempPath = exepath;
242                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
243                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
244                         
245                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(fName);
246                         m->mothurOut("Unable to open " + fName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
247                         ifstream in2;
248                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
249                         in2.close();
250                         fName = tryPath;
251                     }
252                     
253                     if (ableToOpen == 1) {  m->mothurOut("Unable to open " + fName + ". Disregarding. "); m->mothurOutEndLine();  }
254                     else { flowFileVector.push_back(fName); }
255                     m->gobble(flowFilesFile);
256                 }
257                 flowFilesFile.close();
258                 if (flowFileVector.size() == 0) {  m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
259             }
260             else{
261                 if (outputDir == "") { outputDir = m->hasPath(flowFileName); }
262                 flowFileVector.push_back(flowFileName);
263             }
264                 
265                         //check for optional parameter and set defaults
266                         // ...at some point should added some additional type checking...
267                         string temp;
268                         temp = validParameter.validFile(parameters, "lookup", true);
269                         if (temp == "not found")        {       
270                                 lookupFileName = "LookUp_Titanium.pat"; 
271                                 
272                                 int ableToOpen;
273                                 ifstream in;
274                                 ableToOpen = m->openInputFile(lookupFileName, in, "noerror");
275                                 in.close();     
276                                 
277                                 //if you can't open it, try input location
278                                 if (ableToOpen == 1) {
279                                         if (inputDir != "") { //default path is set
280                                                 string tryPath = inputDir + lookupFileName;
281                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
282                                                 ifstream in2;
283                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
284                                                 in2.close();
285                                                 lookupFileName = tryPath;
286                                         }
287                                 }
288                                 
289                                 //if you can't open it, try default location
290                                 if (ableToOpen == 1) {
291                                         if (m->getDefaultPath() != "") { //default path is set
292                                                 string tryPath = m->getDefaultPath() + m->getSimpleName(lookupFileName);
293                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying default " + tryPath); m->mothurOutEndLine();
294                                                 ifstream in2;
295                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
296                                                 in2.close();
297                                                 lookupFileName = tryPath;
298                                         }
299                                 }
300                                 
301                                 //if you can't open it its not in current working directory or inputDir, try mothur excutable location
302                                 if (ableToOpen == 1) {
303                                         string exepath = m->argv;
304                                         string tempPath = exepath;
305                                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
306                                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
307                                         
308                                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
309                                         m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
310                                         ifstream in2;
311                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
312                                         in2.close();
313                                         lookupFileName = tryPath;
314                                 }
315                                 
316                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
317                         }
318                         else if(temp == "not open")     {       
319                                 
320                                 lookupFileName = validParameter.validFile(parameters, "lookup", false);
321                                 
322                                 //if you can't open it its not inputDir, try mothur excutable location
323                                 string exepath = m->argv;
324                                 string tempPath = exepath;
325                                 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
326                                 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
327                                         
328                                 string tryPath = m->getFullPathName(exepath) + lookupFileName;
329                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
330                                 ifstream in2;
331                                 int ableToOpen = m->openInputFile(tryPath, in2, "noerror");
332                                 in2.close();
333                                 lookupFileName = tryPath;
334                                 
335                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
336                         }else                                           {       lookupFileName = temp;  }
337                         
338                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
339                         m->setProcessors(temp);
340                         m->mothurConvert(temp, processors);
341
342                         temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.01";          }
343                         m->mothurConvert(temp, cutoff); 
344                         
345                         temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){       temp = "0.000001";      }
346                         m->mothurConvert(temp, minDelta); 
347
348                         temp = validParameter.validFile(parameters, "maxiter", false);  if (temp == "not found"){       temp = "1000";          }
349                         m->mothurConvert(temp, maxIters); 
350             
351             temp = validParameter.validFile(parameters, "large", false);        if (temp == "not found"){       temp = "0";             }
352                         m->mothurConvert(temp, largeSize); 
353             if (largeSize != 0) { large = true; }
354             else { large = false;  }
355             if (largeSize < 0) {  m->mothurOut("The value of the large cannot be negative.\n"); }
356             
357 #ifdef USE_MPI
358             if (large) { m->mothurOut("The large parameter is not available with the MPI-Enabled version.\n"); large=false; }                           
359 #endif
360             
361
362                         temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
363                         m->mothurConvert(temp, sigma); 
364                         
365                         flowOrder = validParameter.validFile(parameters, "order", false);
366                         if (flowOrder == "not found"){ flowOrder = "TACG";              }
367                         else if(flowOrder.length() != 4){
368                                 m->mothurOut("The value of the order option must be four bases long\n");
369                         }
370                         
371                 }
372 #ifdef USE_MPI
373                 }                               
374 #endif
375         }
376         catch(exception& e) {
377                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
378                 exit(1);
379         }
380 }
381 //**********************************************************************************************************************
382 #ifdef USE_MPI
383 int ShhherCommand::execute(){
384         try {
385                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
386                 
387                 int tag = 1976;
388                 MPI_Status status; 
389                         
390                 if(pid == 0){
391
392                         for(int i=1;i<ncpus;i++){
393                                 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
394                         }
395                         if(abort == 1){ return 0;       }
396
397                         processors = ncpus;
398                         
399                         m->mothurOut("\nGetting preliminary data...\n");
400                         getSingleLookUp();      if (m->control_pressed) { return 0; }
401                         getJointLookUp();       if (m->control_pressed) { return 0; }
402                         
403             vector<string> flowFileVector;
404                         if(flowFilesFileName != "not found"){
405                                 string fName;
406                 
407                                 ifstream flowFilesFile;
408                                 m->openInputFile(flowFilesFileName, flowFilesFile);
409                                 while(flowFilesFile){
410                                         fName = m->getline(flowFilesFile);
411                                         flowFileVector.push_back(fName);
412                                         m->gobble(flowFilesFile);
413                                 }
414                         }
415                         else{
416                                 flowFileVector.push_back(flowFileName);
417                         }
418             
419                         int numFiles = flowFileVector.size();
420
421                         for(int i=1;i<ncpus;i++){
422                                 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
423                         }
424                         
425                         for(int i=0;i<numFiles;i++){
426                                 
427                                 if (m->control_pressed) { break; }
428                                 
429                                 double begClock = clock();
430                                 unsigned long long begTime = time(NULL);
431
432                                 flowFileName = flowFileVector[i];
433                                 
434                                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
435                                 m->mothurOut("Reading flowgrams...\n");
436                                 getFlowData();
437                                 
438                                 if (m->control_pressed) { break; }
439
440                                 m->mothurOut("Identifying unique flowgrams...\n");
441                                 getUniques();
442                                 
443                                 if (m->control_pressed) { break; }
444
445                                 m->mothurOut("Calculating distances between flowgrams...\n");
446                                 char fileName[1024];
447                                 strcpy(fileName, flowFileName.c_str());
448
449                                 for(int i=1;i<ncpus;i++){
450                                         MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
451
452                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
453                                         MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
454                                         MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
455                                         MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
456                                         MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
457                                         MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
458                                         MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
459                                         MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
460                                         MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
461                                         MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
462                                 }                               
463                                                         
464                                 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
465                                 
466                                 if (m->control_pressed) { break; }
467                                 
468                                 int done;
469                                 for(int i=1;i<ncpus;i++){
470                                         MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
471                                         
472                                         m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
473                                         m->mothurRemove((distFileName + ".temp." + toString(i)));
474                                 }
475
476                                 string namesFileName = createNamesFile();
477                                 
478                                 if (m->control_pressed) { break; }
479                                 
480                                 m->mothurOut("\nClustering flowgrams...\n");
481                                 string listFileName = cluster(distFileName, namesFileName);
482                                 
483                                 if (m->control_pressed) { break; }
484                                 
485                 
486                 
487                                 getOTUData(listFileName);
488
489                                 m->mothurRemove(distFileName);
490                                 m->mothurRemove(namesFileName);
491                                 m->mothurRemove(listFileName);
492                                 
493                                 if (m->control_pressed) { break; }
494                                 
495                                 initPyroCluster();
496
497                                 if (m->control_pressed) { break; }
498                                 
499             
500                                 for(int i=1;i<ncpus;i++){
501                                         MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
502                                         MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
503                                         MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
504                                         MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
505                                 }
506                                 
507                                 if (m->control_pressed) { break; }
508                                 
509                                 double maxDelta = 0;
510                                 int iter = 0;
511                                 
512                                 int numOTUsOnCPU = numOTUs / ncpus;
513                                 int numSeqsOnCPU = numSeqs / ncpus;
514                                 m->mothurOut("\nDenoising flowgrams...\n");
515                                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
516                                 
517                                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
518                                         
519                                         double cycClock = clock();
520                                         unsigned long long cycTime = time(NULL);
521                                         fill();
522                                         
523                                         if (m->control_pressed) { break; }
524
525                                         int total = singleTau.size();
526                                         for(int i=1;i<ncpus;i++){
527                                                 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
528                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
529                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
530                                                 
531                                                 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
532                                                 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
533                                                 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
534                                                 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
535                                                 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
536                                         }
537                                         
538                                         calcCentroidsDriver(0, numOTUsOnCPU);
539                                         
540                                         for(int i=1;i<ncpus;i++){
541                                                 int otuStart = i * numOTUs / ncpus;
542                                                 int otuStop = (i + 1) * numOTUs / ncpus;
543                                                 
544                                                 vector<int> tempCentroids(numOTUs, 0);
545                                                 vector<short> tempChange(numOTUs, 0);
546                                                 
547                                                 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
548                                                 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
549                                                 
550                                                 for(int j=otuStart;j<otuStop;j++){
551                                                         centroids[j] = tempCentroids[j];
552                                                         change[j] = tempChange[j];
553                                                 }
554                                         }
555                                                                         
556                                         maxDelta = getNewWeights(); if (m->control_pressed) { break; }
557                                         double nLL = getLikelihood(); if (m->control_pressed) { break; }
558                                         checkCentroids(); if (m->control_pressed) { break; }
559                                         
560                                         for(int i=1;i<ncpus;i++){
561                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
562                                                 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
563                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
564                                         }
565                                         
566                                         calcNewDistancesParent(0, numSeqsOnCPU);
567
568                                         total = singleTau.size();
569
570                                         for(int i=1;i<ncpus;i++){
571                                                 int childTotal;
572                                                 int seqStart = i * numSeqs / ncpus;
573                                                 int seqStop = (i + 1) * numSeqs / ncpus;
574                                                 
575                                                 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
576
577                                                 vector<int> childSeqIndex(childTotal, 0);
578                                                 vector<double> childSingleTau(childTotal, 0);
579                                                 vector<double> childDist(numSeqs * numOTUs, 0);
580                                                 vector<int> otuIndex(childTotal, 0);
581                                                 
582                                                 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
583                                                 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
584                                                 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
585                                                 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
586
587                                                 int oldTotal = total;
588                                                 total += childTotal;
589                                                 singleTau.resize(total, 0);
590                                                 seqIndex.resize(total, 0);
591                                                 seqNumber.resize(total, 0);
592                                                 
593                                                 int childIndex = 0;
594                                                 
595                                                 for(int j=oldTotal;j<total;j++){
596                                                         int otuI = otuIndex[childIndex];
597                                                         int seqI = childSeqIndex[childIndex];
598                                                         
599                                                         singleTau[j] = childSingleTau[childIndex];
600                                                         
601                                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
602                                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
603                                                         nSeqsPerOTU[otuI]++;
604                                                         childIndex++;
605                                                 }
606                                                 
607                                                 int index = seqStart * numOTUs;
608                                                 for(int j=seqStart;j<seqStop;j++){
609                                                         for(int k=0;k<numOTUs;k++){
610                                                                 dist[index] = childDist[index];
611                                                                 index++;
612                                                         }
613                                                 }                                       
614                                         }
615                                         
616                                         iter++;
617                                         
618                                         m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');                  
619
620                                         if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
621                                                 int live = 1;
622                                                 for(int i=1;i<ncpus;i++){
623                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
624                                                 }
625                                         }
626                                         else{
627                                                 int live = 0;
628                                                 for(int i=1;i<ncpus;i++){
629                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
630                                                 }
631                                         }
632                                         
633                                 }       
634                                 
635                                 if (m->control_pressed) { break; }
636                                 
637                                 m->mothurOut("\nFinalizing...\n");
638                                 fill();
639                                 
640                                 if (m->control_pressed) { break; }
641                                 
642                                 setOTUs();
643                                 
644                                 vector<int> otuCounts(numOTUs, 0);
645                                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
646                                 calcCentroidsDriver(0, numOTUs);
647                                 
648                                 if (m->control_pressed) { break; }
649                                 
650                                 writeQualities(otuCounts);      if (m->control_pressed) { break; }
651                                 writeSequences(otuCounts);      if (m->control_pressed) { break; }
652                                 writeNames(otuCounts);          if (m->control_pressed) { break; }
653                                 writeClusters(otuCounts);       if (m->control_pressed) { break; }
654                                 writeGroups();                          if (m->control_pressed) { break; }
655                                 
656                                                                  
657                                 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');                 
658                         }
659                 }
660                 else{
661                         int abort = 1;
662
663                         MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
664                         if(abort){      return 0;       }
665
666                         int numFiles;
667                         MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
668
669                         for(int i=0;i<numFiles;i++){
670                                 
671                                 if (m->control_pressed) { break; }
672                                 
673                                 //Now into the pyrodist part
674                                 bool live = 1;
675
676                                 char fileName[1024];
677                                 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
678                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
679                                 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
680                                 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
681                                 
682                                 flowDataIntI.resize(numSeqs * numFlowCells);
683                                 flowDataPrI.resize(numSeqs * numFlowCells);
684                                 mapUniqueToSeq.resize(numSeqs);
685                                 mapSeqToUnique.resize(numSeqs);
686                                 lengths.resize(numSeqs);
687                                 jointLookUp.resize(NUMBINS * NUMBINS);
688
689                                 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
690                                 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
691                                 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
692                                 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
693                                 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
694                                 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
695                                 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
696
697                                 flowFileName = string(fileName);
698                                 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
699                                 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
700                                 
701                                 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
702                                 
703                                 if (m->control_pressed) { break; }
704                                 
705                                 int done = 1;
706                                 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
707
708                                 //Now into the pyronoise part
709                                 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
710                                 
711                                 singleLookUp.resize(HOMOPS * NUMBINS);
712                                 uniqueFlowgrams.resize(numUniques * numFlowCells);
713                                 weight.resize(numOTUs);
714                                 centroids.resize(numOTUs);
715                                 change.resize(numOTUs);
716                                 dist.assign(numOTUs * numSeqs, 0);
717                                 nSeqsPerOTU.resize(numOTUs);
718                                 cumNumSeqs.resize(numOTUs);
719
720                                 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
721                                 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
722                                 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
723                                 
724                                 int startOTU = pid * numOTUs / ncpus;
725                                 int endOTU = (pid + 1) * numOTUs / ncpus;
726
727                                 int startSeq = pid * numSeqs / ncpus;
728                                 int endSeq = (pid + 1) * numSeqs /ncpus;
729                                 
730                                 int total;
731
732                                 while(live){
733                                         
734                                         if (m->control_pressed) { break; }
735                                         
736                                         MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
737                                         singleTau.assign(total, 0.0000);
738                                         seqNumber.assign(total, 0);
739                                         seqIndex.assign(total, 0);
740
741                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
742                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
743                                         MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
744                                         MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
745                                         MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
746                                         MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
747                                         MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
748
749                                         calcCentroidsDriver(startOTU, endOTU);
750
751                                         MPI_Send(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
752                                         MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
753
754                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
755                                         MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
756                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
757
758                                         vector<int> otuIndex(total, 0);
759                                         calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
760                                         total = otuIndex.size();
761                                         
762                                         MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
763                                         MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
764                                         MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
765                                         MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
766                                         MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
767                                 
768                                         MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
769                                 }
770                         }
771                 }
772                 
773                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
774                 
775                 MPI_Barrier(MPI_COMM_WORLD);
776
777                 
778                 if(compositeFASTAFileName != ""){
779                         outputNames.push_back(compositeFASTAFileName);
780                         outputNames.push_back(compositeNamesFileName);
781                 }
782
783                 m->mothurOutEndLine();
784                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
785                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
786                 m->mothurOutEndLine();
787                 
788                 return 0;
789
790         }
791         catch(exception& e) {
792                 m->errorOut(e, "ShhherCommand", "execute");
793                 exit(1);
794         }
795 }
796 /**************************************************************************************************/
797 string ShhherCommand::createNamesFile(){
798         try{
799                 
800                 vector<string> duplicateNames(numUniques, "");
801                 for(int i=0;i<numSeqs;i++){
802                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
803                 }
804                 
805                 string nameFileName = outputDir + m->getRootName(m->getSimpleName(flowFileName) + getOutputFileNameTag("name");
806                 
807                 ofstream nameFile;
808                 m->openOutputFile(nameFileName, nameFile);
809                 
810                 for(int i=0;i<numUniques;i++){
811                         
812                         if (m->control_pressed) { break; }
813                         
814             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
815                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
816                 }
817                 
818                 nameFile.close();
819                 return  nameFileName;
820         }
821         catch(exception& e) {
822                 m->errorOut(e, "ShhherCommand", "createNamesFile");
823                 exit(1);
824         }
825 }
826 /**************************************************************************************************/
827
828 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
829         try{            
830                 ostringstream outStream;
831                 outStream.setf(ios::fixed, ios::floatfield);
832                 outStream.setf(ios::dec, ios::basefield);
833                 outStream.setf(ios::showpoint);
834                 outStream.precision(6);
835                 
836                 int begTime = time(NULL);
837                 double begClock = clock();
838                 
839                 for(int i=startSeq;i<stopSeq;i++){
840                         
841                         if (m->control_pressed) { break; }
842                         
843                         for(int j=0;j<i;j++){
844                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
845                                 
846                                 if(flowDistance < 1e-6){
847                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
848                                 }
849                                 else if(flowDistance <= cutoff){
850                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
851                                 }
852                         }
853                         if(i % 100 == 0){
854                                 m->mothurOut(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
855                         }
856                 }
857                 
858                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
859                 if(pid != 0){   fDistFileName += ".temp." + toString(pid);      }
860                 
861                 if (m->control_pressed) { return fDistFileName; }
862                 
863                 m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
864
865                 ofstream distFile(fDistFileName.c_str());
866                 distFile << outStream.str();            
867                 distFile.close();
868                 
869                 return fDistFileName;
870         }
871         catch(exception& e) {
872                 m->errorOut(e, "ShhherCommand", "flowDistMPI");
873                 exit(1);
874         }
875 }
876 /**************************************************************************************************/
877  
878 void ShhherCommand::getOTUData(string listFileName){
879     try {
880         
881         ifstream listFile;
882         m->openInputFile(listFileName, listFile);
883         string label;
884         
885         listFile >> label >> numOTUs;
886         
887         otuData.assign(numSeqs, 0);
888         cumNumSeqs.assign(numOTUs, 0);
889         nSeqsPerOTU.assign(numOTUs, 0);
890         aaP.clear();aaP.resize(numOTUs);
891         
892         seqNumber.clear();
893         aaI.clear();
894         seqIndex.clear();
895         
896         string singleOTU = "";
897         
898         for(int i=0;i<numOTUs;i++){
899             
900             if (m->control_pressed) { break; }
901             
902             listFile >> singleOTU;
903             
904             istringstream otuString(singleOTU);
905             
906             while(otuString){
907                 
908                 string seqName = "";
909                 
910                 for(int j=0;j<singleOTU.length();j++){
911                     char letter = otuString.get();
912                     
913                     if(letter != ','){
914                         seqName += letter;
915                     }
916                     else{
917                         map<string,int>::iterator nmIt = nameMap.find(seqName);
918                         int index = nmIt->second;
919                         
920                         nameMap.erase(nmIt);
921                         
922                         otuData[index] = i;
923                         nSeqsPerOTU[i]++;
924                         aaP[i].push_back(index);
925                         seqName = "";
926                     }
927                 }
928                 
929                 map<string,int>::iterator nmIt = nameMap.find(seqName);
930                 
931                 int index = nmIt->second;
932                 nameMap.erase(nmIt);
933                 
934                 otuData[index] = i;
935                 nSeqsPerOTU[i]++;
936                 aaP[i].push_back(index);        
937                 
938                 otuString.get();
939             }
940             
941             sort(aaP[i].begin(), aaP[i].end());
942             for(int j=0;j<nSeqsPerOTU[i];j++){
943                 seqNumber.push_back(aaP[i][j]);
944             }
945             for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
946                 aaP[i].push_back(0);
947             }
948             
949             
950         }
951         
952         for(int i=1;i<numOTUs;i++){
953             cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
954         }
955         aaI = aaP;
956         seqIndex = seqNumber;
957         
958         listFile.close();       
959         
960     }
961     catch(exception& e) {
962         m->errorOut(e, "ShhherCommand", "getOTUData");
963         exit(1);        
964     }           
965 }
966
967 /**************************************************************************************************/
968
969 void ShhherCommand::initPyroCluster(){                          
970     try{
971         if (numOTUs < processors) { processors = 1; }
972         
973         if (m->debug) { m->mothurOut("[DEBUG]: numSeqs = " + toString(numSeqs) + " numOTUS = " + toString(numOTUs) + " about to alloc a dist vector with size = " + toString((numSeqs * numOTUs)) + ".\n"); }
974         
975         dist.assign(numSeqs * numOTUs, 0);
976         change.assign(numOTUs, 1);
977         centroids.assign(numOTUs, -1);
978         weight.assign(numOTUs, 0);
979         singleTau.assign(numSeqs, 1.0);
980         
981         nSeqsBreaks.assign(processors+1, 0);
982         nOTUsBreaks.assign(processors+1, 0);
983         
984         if (m->debug) { m->mothurOut("[DEBUG]: made it through the memory allocation.\n"); }
985         
986         nSeqsBreaks[0] = 0;
987         for(int i=0;i<processors;i++){
988             nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
989             nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
990         }
991         nSeqsBreaks[processors] = numSeqs;
992         nOTUsBreaks[processors] = numOTUs;
993     }
994     catch(exception& e) {
995         m->errorOut(e, "ShhherCommand", "initPyroCluster");
996         exit(1);        
997     }
998 }
999
1000 /**************************************************************************************************/
1001
1002 void ShhherCommand::fill(){
1003     try {
1004         int index = 0;
1005         for(int i=0;i<numOTUs;i++){
1006             
1007             if (m->control_pressed) { break; }
1008             
1009             cumNumSeqs[i] = index;
1010             for(int j=0;j<nSeqsPerOTU[i];j++){
1011                 seqNumber[index] = aaP[i][j];
1012                 seqIndex[index] = aaI[i][j];
1013                 
1014                 index++;
1015             }
1016         }
1017     }
1018     catch(exception& e) {
1019         m->errorOut(e, "ShhherCommand", "fill");
1020         exit(1);        
1021     }           
1022 }
1023
1024 /**************************************************************************************************/
1025  
1026 void ShhherCommand::getFlowData(){
1027     try{
1028         ifstream flowFile;
1029         m->openInputFile(flowFileName, flowFile);
1030         
1031         string seqName;
1032         seqNameVector.clear();
1033         lengths.clear();
1034         flowDataIntI.clear();
1035         nameMap.clear();
1036         
1037         
1038         int currentNumFlowCells;
1039         
1040         float intensity;
1041         
1042         flowFile >> numFlowCells;
1043         int index = 0;//pcluster
1044         while(!flowFile.eof()){
1045             
1046             if (m->control_pressed) { break; }
1047             
1048             flowFile >> seqName >> currentNumFlowCells;
1049             lengths.push_back(currentNumFlowCells);
1050             
1051             seqNameVector.push_back(seqName);
1052             nameMap[seqName] = index++;//pcluster
1053             
1054             for(int i=0;i<numFlowCells;i++){
1055                 flowFile >> intensity;
1056                 if(intensity > 9.99)    {       intensity = 9.99;       }
1057                 int intI = int(100 * intensity + 0.0001);
1058                 flowDataIntI.push_back(intI);
1059             }
1060             m->gobble(flowFile);
1061         }
1062         flowFile.close();
1063         
1064         numSeqs = seqNameVector.size();         
1065         
1066         for(int i=0;i<numSeqs;i++){
1067             
1068             if (m->control_pressed) { break; }
1069             
1070             int iNumFlowCells = i * numFlowCells;
1071             for(int j=lengths[i];j<numFlowCells;j++){
1072                 flowDataIntI[iNumFlowCells + j] = 0;
1073             }
1074         }
1075         
1076     }
1077     catch(exception& e) {
1078         m->errorOut(e, "ShhherCommand", "getFlowData");
1079         exit(1);
1080     }
1081 }
1082 /**************************************************************************************************/
1083 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1084         
1085         try{
1086                 vector<double> newTau(numOTUs,0);
1087                 vector<double> norms(numSeqs, 0);
1088                 otuIndex.clear();
1089                 seqIndex.clear();
1090                 singleTau.clear();
1091                 
1092                 for(int i=startSeq;i<stopSeq;i++){
1093                         
1094                         if (m->control_pressed) { break; }
1095                         
1096                         double offset = 1e8;
1097                         int indexOffset = i * numOTUs;
1098                         
1099                         for(int j=0;j<numOTUs;j++){
1100                                 
1101                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1102                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1103                                 }
1104                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1105                                         offset = dist[indexOffset + j];
1106                                 }
1107                         }
1108                         
1109                         for(int j=0;j<numOTUs;j++){
1110                                 if(weight[j] > MIN_WEIGHT){
1111                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1112                                         norms[i] += newTau[j];
1113                                 }
1114                                 else{
1115                                         newTau[j] = 0.0;
1116                                 }
1117                         }
1118                         
1119                         for(int j=0;j<numOTUs;j++){
1120                 
1121                                 newTau[j] /= norms[i];
1122                                 
1123                                 if(newTau[j] > MIN_TAU){
1124                                         otuIndex.push_back(j);
1125                                         seqIndex.push_back(i);
1126                                         singleTau.push_back(newTau[j]);
1127                                 }
1128                         }
1129                         
1130                 }
1131         }
1132         catch(exception& e) {
1133                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1134                 exit(1);        
1135         }               
1136 }
1137
1138 /**************************************************************************************************/
1139
1140 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1141     
1142     try{
1143         
1144         int total = 0;
1145         vector<double> newTau(numOTUs,0);
1146         vector<double> norms(numSeqs, 0);
1147         nSeqsPerOTU.assign(numOTUs, 0);
1148         
1149         for(int i=startSeq;i<stopSeq;i++){
1150             
1151             if (m->control_pressed) { break; }
1152             
1153             int indexOffset = i * numOTUs;
1154             
1155             double offset = 1e8;
1156             
1157             for(int j=0;j<numOTUs;j++){
1158                 
1159                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1160                     dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1161                 }
1162                 
1163                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1164                     offset = dist[indexOffset + j];
1165                 }
1166             }
1167             
1168             for(int j=0;j<numOTUs;j++){
1169                 if(weight[j] > MIN_WEIGHT){
1170                     newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1171                     norms[i] += newTau[j];
1172                 }
1173                 else{
1174                     newTau[j] = 0.0;
1175                 }
1176             }
1177             
1178             for(int j=0;j<numOTUs;j++){
1179                 newTau[j] /= norms[i];
1180             }
1181             
1182             for(int j=0;j<numOTUs;j++){
1183                 if(newTau[j] > MIN_TAU){
1184                     
1185                     int oldTotal = total;
1186                     
1187                     total++;
1188                     
1189                     singleTau.resize(total, 0);
1190                     seqNumber.resize(total, 0);
1191                     seqIndex.resize(total, 0);
1192                     
1193                     singleTau[oldTotal] = newTau[j];
1194                     
1195                     aaP[j][nSeqsPerOTU[j]] = oldTotal;
1196                     aaI[j][nSeqsPerOTU[j]] = i;
1197                     nSeqsPerOTU[j]++;
1198                 }
1199             }
1200             
1201         }
1202         
1203     }
1204     catch(exception& e) {
1205         m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1206         exit(1);        
1207     }           
1208 }
1209
1210 /**************************************************************************************************/
1211
1212 void ShhherCommand::setOTUs(){
1213     
1214     try {
1215         vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1216         
1217         for(int i=0;i<numOTUs;i++){
1218             
1219             if (m->control_pressed) { break; }
1220             
1221             for(int j=0;j<nSeqsPerOTU[i];j++){
1222                 int index = cumNumSeqs[i] + j;
1223                 double tauValue = singleTau[seqNumber[index]];
1224                 int sIndex = seqIndex[index];
1225                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
1226             }
1227         }
1228         
1229         for(int i=0;i<numSeqs;i++){
1230             double maxTau = -1.0000;
1231             int maxOTU = -1;
1232             for(int j=0;j<numOTUs;j++){
1233                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1234                     maxTau = bigTauMatrix[i * numOTUs + j];
1235                     maxOTU = j;
1236                 }
1237             }
1238             
1239             otuData[i] = maxOTU;
1240         }
1241         
1242         nSeqsPerOTU.assign(numOTUs, 0);         
1243         
1244         for(int i=0;i<numSeqs;i++){
1245             int index = otuData[i];
1246             
1247             singleTau[i] = 1.0000;
1248             dist[i] = 0.0000;
1249             
1250             aaP[index][nSeqsPerOTU[index]] = i;
1251             aaI[index][nSeqsPerOTU[index]] = i;
1252             
1253             nSeqsPerOTU[index]++;
1254         }
1255         fill(); 
1256     }
1257     catch(exception& e) {
1258         m->errorOut(e, "ShhherCommand", "setOTUs");
1259         exit(1);        
1260     }           
1261 }
1262
1263 /**************************************************************************************************/
1264  
1265 void ShhherCommand::getUniques(){
1266     try{
1267         
1268         
1269         numUniques = 0;
1270         uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
1271         uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
1272         uniqueLengths.assign(numSeqs, 0);
1273         mapSeqToUnique.assign(numSeqs, -1);
1274         mapUniqueToSeq.assign(numSeqs, -1);
1275         
1276         vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
1277         
1278         for(int i=0;i<numSeqs;i++){
1279             
1280             if (m->control_pressed) { break; }
1281             
1282             int index = 0;
1283             
1284             vector<short> current(numFlowCells);
1285             for(int j=0;j<numFlowCells;j++){
1286                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
1287             }
1288             
1289             for(int j=0;j<numUniques;j++){
1290                 int offset = j * numFlowCells;
1291                 bool toEnd = 1;
1292                 
1293                 int shorterLength;
1294                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
1295                 else                                                            {       shorterLength = uniqueLengths[j];       }
1296                 
1297                 for(int k=0;k<shorterLength;k++){
1298                     if(current[k] != uniqueFlowgrams[offset + k]){
1299                         toEnd = 0;
1300                         break;
1301                     }
1302                 }
1303                 
1304                 if(toEnd){
1305                     mapSeqToUnique[i] = j;
1306                     uniqueCount[j]++;
1307                     index = j;
1308                     if(lengths[i] > uniqueLengths[j])   {       uniqueLengths[j] = lengths[i];  }
1309                     break;
1310                 }
1311                 index++;
1312             }
1313             
1314             if(index == numUniques){
1315                 uniqueLengths[numUniques] = lengths[i];
1316                 uniqueCount[numUniques] = 1;
1317                 mapSeqToUnique[i] = numUniques;//anMap
1318                 mapUniqueToSeq[numUniques] = i;//anF
1319                 
1320                 for(int k=0;k<numFlowCells;k++){
1321                     uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
1322                     uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
1323                 }
1324                 
1325                 numUniques++;
1326             }
1327         }
1328         uniqueFlowDataIntI.resize(numFlowCells * numUniques);
1329         uniqueLengths.resize(numUniques);       
1330         
1331         flowDataPrI.resize(numSeqs * numFlowCells, 0);
1332         for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
1333     }
1334     catch(exception& e) {
1335         m->errorOut(e, "ShhherCommand", "getUniques");
1336         exit(1);
1337     }
1338 }
1339
1340 /**************************************************************************************************/
1341
1342 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
1343     try{
1344         int minLength = lengths[mapSeqToUnique[seqA]];
1345         if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
1346         
1347         int ANumFlowCells = seqA * numFlowCells;
1348         int BNumFlowCells = seqB * numFlowCells;
1349         
1350         float dist = 0;
1351         
1352         for(int i=0;i<minLength;i++){
1353             
1354             if (m->control_pressed) { break; }
1355             
1356             int flowAIntI = flowDataIntI[ANumFlowCells + i];
1357             float flowAPrI = flowDataPrI[ANumFlowCells + i];
1358             
1359             int flowBIntI = flowDataIntI[BNumFlowCells + i];
1360             float flowBPrI = flowDataPrI[BNumFlowCells + i];
1361             dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
1362         }
1363         
1364         dist /= (float) minLength;
1365         return dist;
1366     }
1367     catch(exception& e) {
1368         m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
1369         exit(1);
1370     }
1371 }
1372
1373 //**********************************************************************************************************************/
1374
1375 string ShhherCommand::cluster(string distFileName, string namesFileName){
1376     try {
1377         
1378         ReadMatrix* read = new ReadColumnMatrix(distFileName);  
1379         read->setCutoff(cutoff);
1380         
1381         NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1382         clusterNameMap->readMap();
1383         read->read(clusterNameMap);
1384         
1385         ListVector* list = read->getListVector();
1386         SparseMatrix* matrix = read->getMatrix();
1387         
1388         delete read; 
1389         delete clusterNameMap; 
1390         
1391         RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
1392         
1393         Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
1394         string tag = cluster->getTag();
1395         
1396         double clusterCutoff = cutoff;
1397         while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1398             
1399             if (m->control_pressed) { break; }
1400             
1401             cluster->update(clusterCutoff);
1402         }
1403         
1404         list->setLabel(toString(cutoff));
1405         
1406         string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
1407         ofstream listFile;
1408         m->openOutputFile(listFileName, listFile);
1409         list->print(listFile);
1410         listFile.close();
1411         
1412         delete matrix;  delete cluster; delete rabund; delete list;
1413         
1414         return listFileName;
1415     }
1416     catch(exception& e) {
1417         m->errorOut(e, "ShhherCommand", "cluster");
1418         exit(1);        
1419     }           
1420 }
1421
1422 /**************************************************************************************************/
1423
1424 void ShhherCommand::calcCentroidsDriver(int start, int finish){                          
1425     
1426     //this function gets the most likely homopolymer length at a flow position for a group of sequences
1427     //within an otu
1428     
1429     try{
1430         
1431         for(int i=start;i<finish;i++){
1432             
1433             if (m->control_pressed) { break; }
1434             
1435             double count = 0;
1436             int position = 0;
1437             int minFlowGram = 100000000;
1438             double minFlowValue = 1e8;
1439             change[i] = 0; //FALSE
1440             
1441             for(int j=0;j<nSeqsPerOTU[i];j++){
1442                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1443             }
1444             
1445             if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1446                 vector<double> adF(nSeqsPerOTU[i]);
1447                 vector<int> anL(nSeqsPerOTU[i]);
1448                 
1449                 for(int j=0;j<nSeqsPerOTU[i];j++){
1450                     int index = cumNumSeqs[i] + j;
1451                     int nI = seqIndex[index];
1452                     int nIU = mapSeqToUnique[nI];
1453                     
1454                     int k;
1455                     for(k=0;k<position;k++){
1456                         if(nIU == anL[k]){
1457                             break;
1458                         }
1459                     }
1460                     if(k == position){
1461                         anL[position] = nIU;
1462                         adF[position] = 0.0000;
1463                         position++;
1464                     }                                           
1465                 }
1466                 
1467                 for(int j=0;j<nSeqsPerOTU[i];j++){
1468                     int index = cumNumSeqs[i] + j;
1469                     int nI = seqIndex[index];
1470                     
1471                     double tauValue = singleTau[seqNumber[index]];
1472                     
1473                     for(int k=0;k<position;k++){
1474                         double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1475                         adF[k] += dist * tauValue;
1476                     }
1477                 }
1478                 
1479                 for(int j=0;j<position;j++){
1480                     if(adF[j] < minFlowValue){
1481                         minFlowGram = j;
1482                         minFlowValue = adF[j];
1483                     }
1484                 }
1485                 
1486                 if(centroids[i] != anL[minFlowGram]){
1487                     change[i] = 1;
1488                     centroids[i] = anL[minFlowGram];
1489                 }
1490             }
1491             else if(centroids[i] != -1){
1492                 change[i] = 1;
1493                 centroids[i] = -1;                      
1494             }
1495         }
1496     }
1497     catch(exception& e) {
1498         m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1499         exit(1);        
1500     }           
1501 }
1502
1503 /**************************************************************************************************/
1504
1505 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1506     try{
1507         
1508         int flowAValue = cent * numFlowCells;
1509         int flowBValue = flow * numFlowCells;
1510         
1511         double dist = 0;
1512         
1513         for(int i=0;i<length;i++){
1514             dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1515             flowAValue++;
1516             flowBValue++;
1517         }
1518         
1519         return dist / (double)length;
1520     }
1521     catch(exception& e) {
1522         m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1523         exit(1);        
1524     }           
1525 }
1526
1527 /**************************************************************************************************/
1528
1529 double ShhherCommand::getNewWeights(){
1530     try{
1531         
1532         double maxChange = 0;
1533         
1534         for(int i=0;i<numOTUs;i++){
1535             
1536             if (m->control_pressed) { break; }
1537             
1538             double difference = weight[i];
1539             weight[i] = 0;
1540             
1541             for(int j=0;j<nSeqsPerOTU[i];j++){
1542                 int index = cumNumSeqs[i] + j;
1543                 double tauValue = singleTau[seqNumber[index]];
1544                 weight[i] += tauValue;
1545             }
1546             
1547             difference = fabs(weight[i] - difference);
1548             if(difference > maxChange){ maxChange = difference; }
1549         }
1550         return maxChange;
1551     }
1552     catch(exception& e) {
1553         m->errorOut(e, "ShhherCommand", "getNewWeights");
1554         exit(1);        
1555     }           
1556 }
1557  
1558  /**************************************************************************************************/
1559  
1560 double ShhherCommand::getLikelihood(){
1561     
1562     try{
1563         
1564         vector<long double> P(numSeqs, 0);
1565         int effNumOTUs = 0;
1566         
1567         for(int i=0;i<numOTUs;i++){
1568             if(weight[i] > MIN_WEIGHT){
1569                 effNumOTUs++;
1570             }
1571         }
1572         
1573         string hold;
1574         for(int i=0;i<numOTUs;i++){
1575             
1576             if (m->control_pressed) { break; }
1577             
1578             for(int j=0;j<nSeqsPerOTU[i];j++){
1579                 int index = cumNumSeqs[i] + j;
1580                 int nI = seqIndex[index];
1581                 double singleDist = dist[seqNumber[index]];
1582                 
1583                 P[nI] += weight[i] * exp(-singleDist * sigma);
1584             }
1585         }
1586         double nLL = 0.00;
1587         for(int i=0;i<numSeqs;i++){
1588             if(P[i] == 0){      P[i] = DBL_EPSILON;     }
1589             
1590             nLL += -log(P[i]);
1591         }
1592         
1593         nLL = nLL -(double)numSeqs * log(sigma);
1594         
1595         return nLL; 
1596     }
1597     catch(exception& e) {
1598         m->errorOut(e, "ShhherCommand", "getNewWeights");
1599         exit(1);        
1600     }           
1601 }
1602
1603 /**************************************************************************************************/
1604
1605 void ShhherCommand::checkCentroids(){
1606     try{
1607         vector<int> unique(numOTUs, 1);
1608         
1609         for(int i=0;i<numOTUs;i++){
1610             if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1611                 unique[i] = -1;
1612             }
1613         }
1614         
1615         for(int i=0;i<numOTUs;i++){
1616             
1617             if (m->control_pressed) { break; }
1618             
1619             if(unique[i] == 1){
1620                 for(int j=i+1;j<numOTUs;j++){
1621                     if(unique[j] == 1){
1622                         
1623                         if(centroids[j] == centroids[i]){
1624                             unique[j] = 0;
1625                             centroids[j] = -1;
1626                             
1627                             weight[i] += weight[j];
1628                             weight[j] = 0.0;
1629                         }
1630                     }
1631                 }
1632             }
1633         }
1634     }
1635     catch(exception& e) {
1636         m->errorOut(e, "ShhherCommand", "checkCentroids");
1637         exit(1);        
1638     }           
1639 }
1640  /**************************************************************************************************/
1641
1642
1643  
1644 void ShhherCommand::writeQualities(vector<int> otuCounts){
1645     
1646     try {
1647         string thisOutputDir = outputDir;
1648         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1649         string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + getOutputFileNameTag("qfile");
1650         
1651         ofstream qualityFile;
1652         m->openOutputFile(qualityFileName, qualityFile);
1653         
1654         qualityFile.setf(ios::fixed, ios::floatfield);
1655         qualityFile.setf(ios::showpoint);
1656         qualityFile << setprecision(6);
1657         
1658         vector<vector<int> > qualities(numOTUs);
1659         vector<double> pr(HOMOPS, 0);
1660         
1661         
1662         for(int i=0;i<numOTUs;i++){
1663             
1664             if (m->control_pressed) { break; }
1665             
1666             int index = 0;
1667             int base = 0;
1668             
1669             if(nSeqsPerOTU[i] > 0){
1670                 qualities[i].assign(1024, -1);
1671                 
1672                 while(index < numFlowCells){
1673                     double maxPrValue = 1e8;
1674                     short maxPrIndex = -1;
1675                     double count = 0.0000;
1676                     
1677                     pr.assign(HOMOPS, 0);
1678                     
1679                     for(int j=0;j<nSeqsPerOTU[i];j++){
1680                         int lIndex = cumNumSeqs[i] + j;
1681                         double tauValue = singleTau[seqNumber[lIndex]];
1682                         int sequenceIndex = aaI[i][j];
1683                         short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1684                         
1685                         count += tauValue;
1686                         
1687                         for(int s=0;s<HOMOPS;s++){
1688                             pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1689                         }
1690                     }
1691                     
1692                     maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1693                     maxPrValue = pr[maxPrIndex];
1694                     
1695                     if(count > MIN_COUNT){
1696                         double U = 0.0000;
1697                         double norm = 0.0000;
1698                         
1699                         for(int s=0;s<HOMOPS;s++){
1700                             norm += exp(-(pr[s] - maxPrValue));
1701                         }
1702                         
1703                         for(int s=1;s<=maxPrIndex;s++){
1704                             int value = 0;
1705                             double temp = 0.0000;
1706                             
1707                             U += exp(-(pr[s-1]-maxPrValue))/norm;
1708                             
1709                             if(U>0.00){
1710                                 temp = log10(U);
1711                             }
1712                             else{
1713                                 temp = -10.1;
1714                             }
1715                             temp = floor(-10 * temp);
1716                             value = (int)floor(temp);
1717                             if(value > 100){    value = 100;    }
1718                             
1719                             qualities[i][base] = (int)value;
1720                             base++;
1721                         }
1722                     }
1723                     
1724                     index++;
1725                 }
1726             }
1727             
1728             
1729             if(otuCounts[i] > 0){
1730                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1731                 
1732                 int j=4;        //need to get past the first four bases
1733                 while(qualities[i][j] != -1){
1734                     qualityFile << qualities[i][j] << ' ';
1735                     j++;
1736                 }
1737                 qualityFile << endl;
1738             }
1739         }
1740         qualityFile.close();
1741         outputNames.push_back(qualityFileName);
1742         
1743     }
1744     catch(exception& e) {
1745         m->errorOut(e, "ShhherCommand", "writeQualities");
1746         exit(1);        
1747     }           
1748 }
1749
1750 /**************************************************************************************************/
1751
1752 void ShhherCommand::writeSequences(vector<int> otuCounts){
1753     try {
1754         string thisOutputDir = outputDir;
1755         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1756         string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + getOutputFileNameTag("fasta");
1757         ofstream fastaFile;
1758         m->openOutputFile(fastaFileName, fastaFile);
1759         
1760         vector<string> names(numOTUs, "");
1761         
1762         for(int i=0;i<numOTUs;i++){
1763             
1764             if (m->control_pressed) { break; }
1765             
1766             int index = centroids[i];
1767             
1768             if(otuCounts[i] > 0){
1769                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1770                 
1771                 string newSeq = "";
1772                 
1773                 for(int j=0;j<numFlowCells;j++){
1774                     
1775                     char base = flowOrder[j % 4];
1776                     for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
1777                         newSeq += base;
1778                     }
1779                 }
1780                 
1781                 fastaFile << newSeq.substr(4) << endl;
1782             }
1783         }
1784         fastaFile.close();
1785         
1786         outputNames.push_back(fastaFileName);
1787         
1788         if(compositeFASTAFileName != ""){
1789             m->appendFiles(fastaFileName, compositeFASTAFileName);
1790         }
1791     }
1792     catch(exception& e) {
1793         m->errorOut(e, "ShhherCommand", "writeSequences");
1794         exit(1);        
1795     }           
1796 }
1797
1798 /**************************************************************************************************/
1799
1800 void ShhherCommand::writeNames(vector<int> otuCounts){
1801     try {
1802         string thisOutputDir = outputDir;
1803         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1804         string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + getOutputFileNameTag("name");
1805         ofstream nameFile;
1806         m->openOutputFile(nameFileName, nameFile);
1807         
1808         for(int i=0;i<numOTUs;i++){
1809             
1810             if (m->control_pressed) { break; }
1811             
1812             if(otuCounts[i] > 0){
1813                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
1814                 
1815                 for(int j=1;j<nSeqsPerOTU[i];j++){
1816                     nameFile << ',' << seqNameVector[aaI[i][j]];
1817                 }
1818                 
1819                 nameFile << endl;
1820             }
1821         }
1822         nameFile.close();
1823         outputNames.push_back(nameFileName);
1824         
1825         
1826         if(compositeNamesFileName != ""){
1827             m->appendFiles(nameFileName, compositeNamesFileName);
1828         }               
1829     }
1830     catch(exception& e) {
1831         m->errorOut(e, "ShhherCommand", "writeNames");
1832         exit(1);        
1833     }           
1834 }
1835
1836 /**************************************************************************************************/
1837
1838 void ShhherCommand::writeGroups(){
1839     try {
1840         string thisOutputDir = outputDir;
1841         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1842         string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1843         string groupFileName = fileRoot + getOutputFileNameTag("group");
1844         ofstream groupFile;
1845         m->openOutputFile(groupFileName, groupFile);
1846         
1847         for(int i=0;i<numSeqs;i++){
1848             if (m->control_pressed) { break; }
1849             groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
1850         }
1851         groupFile.close();
1852         outputNames.push_back(groupFileName);
1853         
1854     }
1855     catch(exception& e) {
1856         m->errorOut(e, "ShhherCommand", "writeGroups");
1857         exit(1);        
1858     }           
1859 }
1860
1861 /**************************************************************************************************/
1862
1863 void ShhherCommand::writeClusters(vector<int> otuCounts){
1864     try {
1865         string thisOutputDir = outputDir;
1866         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1867         string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) +getOutputFileNameTag("counts");
1868         ofstream otuCountsFile;
1869         m->openOutputFile(otuCountsFileName, otuCountsFile);
1870         
1871         string bases = flowOrder;
1872         
1873         for(int i=0;i<numOTUs;i++){
1874             
1875             if (m->control_pressed) {
1876                 break;
1877             }
1878             //output the translated version of the centroid sequence for the otu
1879             if(otuCounts[i] > 0){
1880                 int index = centroids[i];
1881                 
1882                 otuCountsFile << "ideal\t";
1883                 for(int j=8;j<numFlowCells;j++){
1884                     char base = bases[j % 4];
1885                     for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
1886                         otuCountsFile << base;
1887                     }
1888                 }
1889                 otuCountsFile << endl;
1890                 
1891                 for(int j=0;j<nSeqsPerOTU[i];j++){
1892                     int sequence = aaI[i][j];
1893                     otuCountsFile << seqNameVector[sequence] << '\t';
1894                     
1895                     string newSeq = "";
1896                     
1897                     for(int k=0;k<lengths[sequence];k++){
1898                         char base = bases[k % 4];
1899                         int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
1900                         
1901                         for(int s=0;s<freq;s++){
1902                             newSeq += base;
1903                             //otuCountsFile << base;
1904                         }
1905                     }
1906                     otuCountsFile << newSeq.substr(4) << endl;
1907                 }
1908                 otuCountsFile << endl;
1909             }
1910         }
1911         otuCountsFile.close();
1912         outputNames.push_back(otuCountsFileName);
1913         
1914     }
1915     catch(exception& e) {
1916         m->errorOut(e, "ShhherCommand", "writeClusters");
1917         exit(1);        
1918     }           
1919 }
1920
1921 #else
1922 //**********************************************************************************************************************
1923
1924 int ShhherCommand::execute(){
1925         try {
1926                 if (abort == true) { return 0; }
1927                 
1928                 getSingleLookUp();      if (m->control_pressed) { return 0; }
1929                 getJointLookUp();       if (m->control_pressed) { return 0; }
1930                 
1931         int numFiles = flowFileVector.size();
1932                 
1933         if (numFiles < processors) { processors = numFiles; }
1934         
1935 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1936         if (processors == 1) { driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size()); }
1937         else { createProcesses(flowFileVector); } //each processor processes one file
1938 #else
1939         driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size());
1940 #endif
1941         
1942                 if(compositeFASTAFileName != ""){
1943                         outputNames.push_back(compositeFASTAFileName);
1944                         outputNames.push_back(compositeNamesFileName);
1945                 }
1946
1947                 m->mothurOutEndLine();
1948                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
1949                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
1950                 m->mothurOutEndLine();
1951                 
1952                 return 0;
1953         }
1954         catch(exception& e) {
1955                 m->errorOut(e, "ShhherCommand", "execute");
1956                 exit(1);
1957         }
1958 }
1959 #endif
1960 /**************************************************************************************************/
1961
1962 int ShhherCommand::createProcesses(vector<string> filenames){
1963     try {
1964         vector<int> processIDS;
1965                 int process = 1;
1966                 int num = 0;
1967                 
1968                 //sanity check
1969                 if (filenames.size() < processors) { processors = filenames.size(); }
1970                 
1971                 //divide the groups between the processors
1972                 vector<linePair> lines;
1973         vector<int> numFilesToComplete;
1974                 int numFilesPerProcessor = filenames.size() / processors;
1975                 for (int i = 0; i < processors; i++) {
1976                         int startIndex =  i * numFilesPerProcessor;
1977                         int endIndex = (i+1) * numFilesPerProcessor;
1978                         if(i == (processors - 1)){      endIndex = filenames.size();    }
1979                         lines.push_back(linePair(startIndex, endIndex));
1980             numFilesToComplete.push_back((endIndex-startIndex));
1981                 }
1982                 
1983         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)          
1984                 
1985                 //loop through and create all the processes you want
1986                 while (process != processors) {
1987                         int pid = fork();
1988                         
1989                         if (pid > 0) {
1990                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1991                                 process++;
1992                         }else if (pid == 0){
1993                                 num = driver(filenames, compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName  + toString(getpid()) + ".temp", lines[process].start, lines[process].end);
1994                 
1995                 //pass numSeqs to parent
1996                                 ofstream out;
1997                                 string tempFile = compositeFASTAFileName + toString(getpid()) + ".num.temp";
1998                                 m->openOutputFile(tempFile, out);
1999                                 out << num << endl;
2000                                 out.close();
2001                 
2002                                 exit(0);
2003                         }else { 
2004                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
2005                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
2006                                 exit(0);
2007                         }
2008                 }
2009                 
2010                 //do my part
2011                 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[0].start, lines[0].end);
2012                 
2013                 //force parent to wait until all the processes are done
2014                 for (int i=0;i<processIDS.size();i++) { 
2015                         int temp = processIDS[i];
2016                         wait(&temp);
2017                 }
2018         
2019         #else
2020         
2021         //////////////////////////////////////////////////////////////////////////////////////////////////////
2022         
2023         /////////////////////// NOT WORKING, ACCESS VIOLATION ON READ OF FLOWGRAMS IN THREAD /////////////////
2024         
2025         //////////////////////////////////////////////////////////////////////////////////////////////////////
2026                 //Windows version shared memory, so be careful when passing variables through the shhhFlowsData struct. 
2027                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
2028                 //////////////////////////////////////////////////////////////////////////////////////////////////////
2029                 
2030                 vector<shhhFlowsData*> pDataArray; 
2031                 DWORD   dwThreadIdArray[processors-1];
2032                 HANDLE  hThreadArray[processors-1]; 
2033                 
2034                 //Create processor worker threads.
2035                 for( int i=0; i<processors-1; i++ ){
2036                         // Allocate memory for thread data.
2037                         string extension = "";
2038                         if (i != 0) { extension = toString(i) + ".temp"; }
2039                         
2040             shhhFlowsData* tempFlow = new shhhFlowsData(filenames, (compositeFASTAFileName + extension), (compositeNamesFileName + extension), outputDir, flowOrder, jointLookUp, singleLookUp, m, lines[i].start, lines[i].end, cutoff, sigma, minDelta, maxIters, i);
2041                         pDataArray.push_back(tempFlow);
2042                         processIDS.push_back(i);
2043             
2044                         hThreadArray[i] = CreateThread(NULL, 0, ShhhFlowsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
2045                 }
2046                 
2047                 //using the main process as a worker saves time and memory
2048                 //do my part
2049                 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[processors-1].start, lines[processors-1].end);
2050                 
2051                 //Wait until all threads have terminated.
2052                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
2053                 
2054                 //Close all thread handles and free memory allocations.
2055                 for(int i=0; i < pDataArray.size(); i++){
2056                         for(int j=0; j < pDataArray[i]->outputNames.size(); j++){ outputNames.push_back(pDataArray[i]->outputNames[j]); }
2057                         CloseHandle(hThreadArray[i]);
2058                         delete pDataArray[i];
2059                 }
2060                 
2061         #endif
2062         
2063         for (int i=0;i<processIDS.size();i++) { 
2064             ifstream in;
2065                         string tempFile =  compositeFASTAFileName + toString(processIDS[i]) + ".num.temp";
2066                         m->openInputFile(tempFile, in);
2067                         if (!in.eof()) { 
2068                 int tempNum = 0; 
2069                 in >> tempNum; 
2070                 if (tempNum != numFilesToComplete[i+1]) {
2071                     m->mothurOut("[ERROR]: main process expected " + toString(processIDS[i]) + " to complete " + toString(numFilesToComplete[i+1]) + " files, and it only reported completing " + toString(tempNum) + ". This will cause file mismatches.  The flow files may be too large to process with multiple processors. \n");
2072                 }
2073             }
2074                         in.close(); m->mothurRemove(tempFile);
2075             
2076             if (compositeFASTAFileName != "") {
2077                 m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
2078                 m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
2079                 m->mothurRemove((compositeFASTAFileName + toString(processIDS[i]) + ".temp"));
2080                 m->mothurRemove((compositeNamesFileName + toString(processIDS[i]) + ".temp"));
2081             }
2082         }
2083         
2084         return 0;
2085         
2086     }
2087         catch(exception& e) {
2088                 m->errorOut(e, "ShhherCommand", "createProcesses");
2089                 exit(1);
2090         }
2091 }
2092 /**************************************************************************************************/
2093
2094 vector<string> ShhherCommand::parseFlowFiles(string filename){
2095     try {
2096         vector<string> files;
2097         int count = 0;
2098         
2099         ifstream in;
2100         m->openInputFile(filename, in);
2101         
2102         int thisNumFLows = 0;
2103         in >> thisNumFLows; m->gobble(in);
2104         
2105         while (!in.eof()) {
2106             if (m->control_pressed) { break; }
2107             
2108             ofstream out;
2109             string outputFileName = filename + toString(count) + ".temp";
2110             m->openOutputFile(outputFileName, out);
2111             out << thisNumFLows << endl;
2112             files.push_back(outputFileName);
2113             
2114             int numLinesWrote = 0;
2115             for (int i = 0; i < largeSize; i++) {
2116                 if (in.eof()) { break; }
2117                 string line = m->getline(in); m->gobble(in);
2118                 out << line << endl;
2119                 numLinesWrote++;
2120             }
2121             out.close();
2122             
2123             if (numLinesWrote == 0) {  m->mothurRemove(outputFileName); files.pop_back();  }
2124             count++;
2125         }
2126         in.close();
2127         
2128         if (m->control_pressed) { for (int i = 0; i < files.size(); i++) { m->mothurRemove(files[i]); }  files.clear(); }
2129         
2130         m->mothurOut("\nDivided " + filename + " into " + toString(files.size()) + " files.\n\n"); 
2131         
2132         return files;
2133     }
2134         catch(exception& e) {
2135                 m->errorOut(e, "ShhherCommand", "parseFlowFiles");
2136                 exit(1);
2137         }
2138 }
2139 /**************************************************************************************************/
2140
2141 int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName, int start, int end){
2142     try {
2143         
2144         int numCompleted = 0;
2145         
2146         for(int i=start;i<end;i++){
2147                         
2148                         if (m->control_pressed) { break; }
2149                         
2150             vector<string> theseFlowFileNames; theseFlowFileNames.push_back(filenames[i]);
2151             if (large) {  theseFlowFileNames = parseFlowFiles(filenames[i]);  }
2152             
2153             if (m->control_pressed) { break; }
2154             
2155             double begClock = clock();
2156             unsigned long long begTime;
2157             
2158             for (int g = 0; g < theseFlowFileNames.size(); g++) {
2159                 
2160                 string flowFileName = theseFlowFileNames[g];
2161                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
2162                 m->mothurOut("Reading flowgrams...\n");
2163                 
2164                 vector<string> seqNameVector;
2165                 vector<int> lengths;
2166                 vector<short> flowDataIntI;
2167                 vector<double> flowDataPrI;
2168                 map<string, int> nameMap;
2169                 vector<short> uniqueFlowgrams;
2170                 vector<int> uniqueCount;
2171                 vector<int> mapSeqToUnique;
2172                 vector<int> mapUniqueToSeq;
2173                 vector<int> uniqueLengths;
2174                 int numFlowCells;
2175                 
2176                 if (m->debug) { m->mothurOut("[DEBUG]: About to read flowgrams.\n"); }
2177                 int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
2178                 
2179                 if (m->control_pressed) { break; }
2180                 
2181                 m->mothurOut("Identifying unique flowgrams...\n");
2182                 int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
2183                 
2184                 if (m->control_pressed) { break; }
2185                 
2186                 m->mothurOut("Calculating distances between flowgrams...\n");
2187                 string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
2188                 begTime = time(NULL);
2189                
2190                 
2191                 flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI); 
2192                 
2193                 m->mothurOutEndLine();
2194                 m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
2195                 
2196                 
2197                 string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
2198                 createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
2199                 
2200                 if (m->control_pressed) { break; }
2201                 
2202                 m->mothurOut("\nClustering flowgrams...\n");
2203                 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
2204                 cluster(listFileName, distFileName, namesFileName);
2205                 
2206                 if (m->control_pressed) { break; }
2207                 
2208                 vector<int> otuData;
2209                 vector<int> cumNumSeqs;
2210                 vector<int> nSeqsPerOTU;
2211                 vector<vector<int> > aaP;       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2212                 vector<vector<int> > aaI;       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2213                 vector<int> seqNumber;          //tMaster->anP:         the sequence id number sorted by OTU
2214                 vector<int> seqIndex;           //tMaster->anI;         the index that corresponds to seqNumber
2215                 
2216                 
2217                 int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
2218                 
2219                 if (m->control_pressed) { break; }
2220                 
2221                 m->mothurRemove(distFileName);
2222                 m->mothurRemove(namesFileName);
2223                 m->mothurRemove(listFileName);
2224                 
2225                 vector<double> dist;            //adDist - distance of sequences to centroids
2226                 vector<short> change;           //did the centroid sequence change? 0 = no; 1 = yes
2227                 vector<int> centroids;          //the representative flowgram for each cluster m
2228                 vector<double> weight;
2229                 vector<double> singleTau;       //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2230                 vector<int> nSeqsBreaks;
2231                 vector<int> nOTUsBreaks;
2232                 
2233                 if (m->debug) { m->mothurOut("[DEBUG]: numSeqs = " + toString(numSeqs) + " numOTUS = " + toString(numOTUs) + " about to alloc a dist vector with size = " + toString((numSeqs * numOTUs)) + ".\n"); }
2234                 
2235                 dist.assign(numSeqs * numOTUs, 0);
2236                 change.assign(numOTUs, 1);
2237                 centroids.assign(numOTUs, -1);
2238                 weight.assign(numOTUs, 0);
2239                 singleTau.assign(numSeqs, 1.0);
2240                 
2241                 nSeqsBreaks.assign(2, 0);
2242                 nOTUsBreaks.assign(2, 0);
2243                 
2244                 nSeqsBreaks[0] = 0;
2245                 nSeqsBreaks[1] = numSeqs;
2246                 nOTUsBreaks[1] = numOTUs;
2247                 
2248                 if (m->debug) { m->mothurOut("[DEBUG]: done allocating memory, about to denoise.\n"); }
2249                 
2250                 if (m->control_pressed) { break; }
2251                 
2252                 double maxDelta = 0;
2253                 int iter = 0;
2254                 
2255                 begClock = clock();
2256                 begTime = time(NULL);
2257                 
2258                 m->mothurOut("\nDenoising flowgrams...\n");
2259                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
2260                 
2261                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
2262                     
2263                     if (m->control_pressed) { break; }
2264                     
2265                     double cycClock = clock();
2266                     unsigned long long cycTime = time(NULL);
2267                     fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2268                     
2269                     if (m->control_pressed) { break; }
2270                     
2271                     calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2272                     
2273                     if (m->control_pressed) { break; }
2274                     
2275                     maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);  
2276                     
2277                     if (m->control_pressed) { break; }
2278                     
2279                     double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight); 
2280                     
2281                     if (m->control_pressed) { break; }
2282                     
2283                     checkCentroids(numOTUs, centroids, weight);
2284                     
2285                     if (m->control_pressed) { break; }
2286                     
2287                     calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU,  dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
2288                     
2289                     if (m->control_pressed) { break; }
2290                     
2291                     iter++;
2292                     
2293                     m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
2294                     
2295                 }       
2296                 
2297                 if (m->control_pressed) { break; }
2298                 
2299                 m->mothurOut("\nFinalizing...\n");
2300                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2301                 
2302                 if (m->control_pressed) { break; }
2303                 
2304                 setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
2305                 
2306                 if (m->control_pressed) { break; }
2307                 
2308                 vector<int> otuCounts(numOTUs, 0);
2309                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
2310                 
2311                 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber); 
2312                 
2313                 if (m->control_pressed) { break; }
2314                 
2315                 if ((large) && (g == 0)) {  flowFileName = filenames[i]; theseFlowFileNames[0] = filenames[i]; }
2316                 string thisOutputDir = outputDir;
2317                 if (outputDir == "") {  thisOutputDir = m->hasPath(flowFileName);  }
2318                 string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + getOutputFileNameTag("qfile");
2319                 string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + getOutputFileNameTag("fasta");
2320                 string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + getOutputFileNameTag("name");
2321                 string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + getOutputFileNameTag("counts");
2322                 string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
2323                 string groupFileName = fileRoot + getOutputFileNameTag("group");
2324
2325                 
2326                 writeQualities(numOTUs, numFlowCells, qualityFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
2327                 writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, fastaFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
2328                 writeNames(thisCompositeNamesFileName, numOTUs, nameFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU);                              if (m->control_pressed) { break; }
2329                 writeClusters(otuCountsFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI);                  if (m->control_pressed) { break; }
2330                 writeGroups(groupFileName, fileRoot, numSeqs, seqNameVector);                                           if (m->control_pressed) { break; }
2331                 
2332                 if (large) {
2333                     if (g > 0) {
2334                         m->appendFiles(qualityFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + getOutputFileNameTag("qfile")));
2335                         m->mothurRemove(qualityFileName);
2336                         m->appendFiles(fastaFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + getOutputFileNameTag("fasta")));
2337                         m->mothurRemove(fastaFileName);
2338                         m->appendFiles(nameFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + getOutputFileNameTag("name")));
2339                         m->mothurRemove(nameFileName);
2340                         m->appendFiles(otuCountsFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + getOutputFileNameTag("counts")));
2341                         m->mothurRemove(otuCountsFileName);
2342                         m->appendFiles(groupFileName, (thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0])) + getOutputFileNameTag("group")));
2343                         m->mothurRemove(groupFileName);
2344                     }
2345                     m->mothurRemove(theseFlowFileNames[g]);
2346                 }
2347                         }
2348             
2349             numCompleted++;
2350                         m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
2351                 }
2352                 
2353         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
2354         
2355         return numCompleted;
2356         
2357     }catch(exception& e) {
2358             m->errorOut(e, "ShhherCommand", "driver");
2359             exit(1);
2360     }
2361 }
2362
2363 /**************************************************************************************************/
2364 int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVector, vector<int>& thisLengths, vector<short>& thisFlowDataIntI, map<string, int>& thisNameMap, int& numFlowCells){
2365         try{
2366        
2367                 ifstream flowFile;
2368        
2369                 m->openInputFile(filename, flowFile);
2370                 
2371                 string seqName;
2372                 int currentNumFlowCells;
2373                 float intensity;
2374         thisSeqNameVector.clear();
2375                 thisLengths.clear();
2376                 thisFlowDataIntI.clear();
2377                 thisNameMap.clear();
2378                 
2379                 flowFile >> numFlowCells;
2380         if (m->debug) { m->mothurOut("[DEBUG]: numFlowCells = " + toString(numFlowCells) + ".\n"); }
2381                 int index = 0;//pcluster
2382                 while(!flowFile.eof()){
2383                         
2384                         if (m->control_pressed) { break; }
2385                         
2386                         flowFile >> seqName >> currentNumFlowCells;
2387             
2388                         thisLengths.push_back(currentNumFlowCells);
2389            
2390                         thisSeqNameVector.push_back(seqName);
2391                         thisNameMap[seqName] = index++;//pcluster
2392             
2393             if (m->debug) { m->mothurOut("[DEBUG]: seqName = " + seqName + " length = " + toString(currentNumFlowCells) + " index = " + toString(index) + "\n"); }
2394             
2395                         for(int i=0;i<numFlowCells;i++){
2396                                 flowFile >> intensity;
2397                                 if(intensity > 9.99)    {       intensity = 9.99;       }
2398                                 int intI = int(100 * intensity + 0.0001);
2399                                 thisFlowDataIntI.push_back(intI);
2400                         }
2401                         m->gobble(flowFile);
2402                 }
2403                 flowFile.close();
2404                 
2405                 int numSeqs = thisSeqNameVector.size();         
2406                 
2407                 for(int i=0;i<numSeqs;i++){
2408                         
2409                         if (m->control_pressed) { break; }
2410                         
2411                         int iNumFlowCells = i * numFlowCells;
2412                         for(int j=thisLengths[i];j<numFlowCells;j++){
2413                                 thisFlowDataIntI[iNumFlowCells + j] = 0;
2414                         }
2415                 }
2416         
2417         return numSeqs;
2418                 
2419         }
2420         catch(exception& e) {
2421                 m->errorOut(e, "ShhherCommand", "getFlowData");
2422                 exit(1);
2423         }
2424 }
2425 /**************************************************************************************************/
2426
2427 int ShhherCommand::flowDistParentFork(int numFlowCells, string distFileName, int stopSeq, vector<int>& mapUniqueToSeq, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2428         try{            
2429         
2430                 ostringstream outStream;
2431                 outStream.setf(ios::fixed, ios::floatfield);
2432                 outStream.setf(ios::dec, ios::basefield);
2433                 outStream.setf(ios::showpoint);
2434                 outStream.precision(6);
2435                 
2436                 int begTime = time(NULL);
2437                 double begClock = clock();
2438         
2439                 for(int i=0;i<stopSeq;i++){
2440                         
2441                         if (m->control_pressed) { break; }
2442                         
2443                         for(int j=0;j<i;j++){
2444                                 float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2445                 
2446                                 if(flowDistance < 1e-6){
2447                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
2448                                 }
2449                                 else if(flowDistance <= cutoff){
2450                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
2451                                 }
2452                         }
2453                         if(i % 100 == 0){
2454                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
2455                                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2456                                 m->mothurOutEndLine();
2457                         }
2458                 }
2459                 
2460                 ofstream distFile(distFileName.c_str());
2461                 distFile << outStream.str();            
2462                 distFile.close();
2463                 
2464                 if (m->control_pressed) {}
2465                 else {
2466                         m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
2467                         m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2468                         m->mothurOutEndLine();
2469                 }
2470         
2471         return 0;
2472         }
2473         catch(exception& e) {
2474                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
2475                 exit(1);
2476         }
2477 }
2478 /**************************************************************************************************/
2479
2480 float ShhherCommand::calcPairwiseDist(int numFlowCells, int seqA, int seqB, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2481         try{
2482                 int minLength = lengths[mapSeqToUnique[seqA]];
2483                 if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
2484                 
2485                 int ANumFlowCells = seqA * numFlowCells;
2486                 int BNumFlowCells = seqB * numFlowCells;
2487                 
2488                 float dist = 0;
2489                 
2490                 for(int i=0;i<minLength;i++){
2491                         
2492                         if (m->control_pressed) { break; }
2493                         
2494                         int flowAIntI = flowDataIntI[ANumFlowCells + i];
2495                         float flowAPrI = flowDataPrI[ANumFlowCells + i];
2496                         
2497                         int flowBIntI = flowDataIntI[BNumFlowCells + i];
2498                         float flowBPrI = flowDataPrI[BNumFlowCells + i];
2499                         dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
2500                 }
2501                 
2502                 dist /= (float) minLength;
2503                 return dist;
2504         }
2505         catch(exception& e) {
2506                 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
2507                 exit(1);
2508         }
2509 }
2510
2511 /**************************************************************************************************/
2512
2513 int ShhherCommand::getUniques(int numSeqs, int numFlowCells, vector<short>& uniqueFlowgrams, vector<int>& uniqueCount, vector<int>& uniqueLengths, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2514         try{
2515                 int numUniques = 0;
2516                 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
2517                 uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
2518                 uniqueLengths.assign(numSeqs, 0);
2519                 mapSeqToUnique.assign(numSeqs, -1);
2520                 mapUniqueToSeq.assign(numSeqs, -1);
2521                 
2522                 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
2523                 
2524                 for(int i=0;i<numSeqs;i++){
2525                         
2526                         if (m->control_pressed) { break; }
2527                         
2528                         int index = 0;
2529                         
2530                         vector<short> current(numFlowCells);
2531                         for(int j=0;j<numFlowCells;j++){
2532                                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
2533                         }
2534             
2535                         for(int j=0;j<numUniques;j++){
2536                                 int offset = j * numFlowCells;
2537                                 bool toEnd = 1;
2538                                 
2539                                 int shorterLength;
2540                                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
2541                                 else                                                            {       shorterLength = uniqueLengths[j];       }
2542                 
2543                                 for(int k=0;k<shorterLength;k++){
2544                                         if(current[k] != uniqueFlowgrams[offset + k]){
2545                                                 toEnd = 0;
2546                                                 break;
2547                                         }
2548                                 }
2549                                 
2550                                 if(toEnd){
2551                                         mapSeqToUnique[i] = j;
2552                                         uniqueCount[j]++;
2553                                         index = j;
2554                                         if(lengths[i] > uniqueLengths[j])       {       uniqueLengths[j] = lengths[i];  }
2555                                         break;
2556                                 }
2557                                 index++;
2558                         }
2559                         
2560                         if(index == numUniques){
2561                                 uniqueLengths[numUniques] = lengths[i];
2562                                 uniqueCount[numUniques] = 1;
2563                                 mapSeqToUnique[i] = numUniques;//anMap
2564                                 mapUniqueToSeq[numUniques] = i;//anF
2565                                 
2566                                 for(int k=0;k<numFlowCells;k++){
2567                                         uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
2568                                         uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
2569                                 }
2570                                 
2571                                 numUniques++;
2572                         }
2573                 }
2574                 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
2575                 uniqueLengths.resize(numUniques);       
2576                 
2577                 flowDataPrI.resize(numSeqs * numFlowCells, 0);
2578                 for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
2579         
2580         return numUniques;
2581         }
2582         catch(exception& e) {
2583                 m->errorOut(e, "ShhherCommand", "getUniques");
2584                 exit(1);
2585         }
2586 }
2587 /**************************************************************************************************/
2588 int ShhherCommand::createNamesFile(int numSeqs, int numUniques, string filename, vector<string>& seqNameVector, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq){
2589         try{
2590                 
2591                 vector<string> duplicateNames(numUniques, "");
2592                 for(int i=0;i<numSeqs;i++){
2593                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
2594                 }
2595                 
2596                 ofstream nameFile;
2597                 m->openOutputFile(filename, nameFile);
2598                 
2599                 for(int i=0;i<numUniques;i++){
2600                         
2601                         if (m->control_pressed) { break; }
2602                         
2603             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2604                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2605                 }
2606                 
2607                 nameFile.close();
2608         
2609                 return 0;
2610         }
2611         catch(exception& e) {
2612                 m->errorOut(e, "ShhherCommand", "createNamesFile");
2613                 exit(1);
2614         }
2615 }
2616 //**********************************************************************************************************************
2617
2618 int ShhherCommand::cluster(string filename, string distFileName, string namesFileName){
2619         try {
2620                 
2621                 ReadMatrix* read = new ReadColumnMatrix(distFileName);  
2622                 read->setCutoff(cutoff);
2623                 
2624                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
2625                 clusterNameMap->readMap();
2626                 read->read(clusterNameMap);
2627         
2628                 ListVector* list = read->getListVector();
2629                 SparseMatrix* matrix = read->getMatrix();
2630                 
2631                 delete read; 
2632                 delete clusterNameMap; 
2633         
2634                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
2635                 
2636                 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
2637                 string tag = cluster->getTag();
2638                 
2639                 double clusterCutoff = cutoff;
2640                 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
2641                         
2642                         if (m->control_pressed) { break; }
2643                         
2644                         cluster->update(clusterCutoff);
2645                 }
2646                 
2647                 list->setLabel(toString(cutoff));
2648                 
2649                 ofstream listFile;
2650                 m->openOutputFile(filename, listFile);
2651                 list->print(listFile);
2652                 listFile.close();
2653                 
2654                 delete matrix;  delete cluster; delete rabund; delete list;
2655         
2656                 return 0;
2657         }
2658         catch(exception& e) {
2659                 m->errorOut(e, "ShhherCommand", "cluster");
2660                 exit(1);        
2661         }               
2662 }
2663 /**************************************************************************************************/
2664
2665 int ShhherCommand::getOTUData(int numSeqs, string fileName,  vector<int>& otuData,
2666                                vector<int>& cumNumSeqs,
2667                                vector<int>& nSeqsPerOTU,
2668                                vector<vector<int> >& aaP,       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2669                                vector<vector<int> >& aaI,       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2670                                vector<int>& seqNumber,          //tMaster->anP:         the sequence id number sorted by OTU
2671                                vector<int>& seqIndex,
2672                                map<string, int>& nameMap){
2673         try {
2674         
2675                 ifstream listFile;
2676                 m->openInputFile(fileName, listFile);
2677                 string label;
2678         int numOTUs;
2679                 
2680                 listFile >> label >> numOTUs;
2681         
2682         if (m->debug) { m->mothurOut("[DEBUG]: Getting OTU Data...\n"); }
2683         
2684                 otuData.assign(numSeqs, 0);
2685                 cumNumSeqs.assign(numOTUs, 0);
2686                 nSeqsPerOTU.assign(numOTUs, 0);
2687                 aaP.clear();aaP.resize(numOTUs);
2688                 
2689                 seqNumber.clear();
2690                 aaI.clear();
2691                 seqIndex.clear();
2692                 
2693                 string singleOTU = "";
2694                 
2695                 for(int i=0;i<numOTUs;i++){
2696                         
2697                         if (m->control_pressed) { break; }
2698             if (m->debug) { m->mothurOut("[DEBUG]: processing OTU " + toString(i) + ".\n"); }
2699             
2700                         listFile >> singleOTU;
2701                         
2702                         istringstream otuString(singleOTU);
2703             
2704                         while(otuString){
2705                                 
2706                                 string seqName = "";
2707                                 
2708                                 for(int j=0;j<singleOTU.length();j++){
2709                                         char letter = otuString.get();
2710                                         
2711                                         if(letter != ','){
2712                                                 seqName += letter;
2713                                         }
2714                                         else{
2715                                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2716                                                 int index = nmIt->second;
2717                                                 
2718                                                 nameMap.erase(nmIt);
2719                                                 
2720                                                 otuData[index] = i;
2721                                                 nSeqsPerOTU[i]++;
2722                                                 aaP[i].push_back(index);
2723                                                 seqName = "";
2724                                         }
2725                                 }
2726                                 
2727                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2728                 
2729                                 int index = nmIt->second;
2730                                 nameMap.erase(nmIt);
2731                 
2732                                 otuData[index] = i;
2733                                 nSeqsPerOTU[i]++;
2734                                 aaP[i].push_back(index);        
2735                                 
2736                                 otuString.get();
2737                         }
2738                         
2739                         sort(aaP[i].begin(), aaP[i].end());
2740                         for(int j=0;j<nSeqsPerOTU[i];j++){
2741                                 seqNumber.push_back(aaP[i][j]);
2742                         }
2743                         for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
2744                                 aaP[i].push_back(0);
2745                         }
2746                         
2747                         
2748                 }
2749                 
2750                 for(int i=1;i<numOTUs;i++){
2751                         cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
2752                 }
2753                 aaI = aaP;
2754                 seqIndex = seqNumber;
2755                 
2756                 listFile.close();       
2757         
2758         return numOTUs;
2759                 
2760         }
2761         catch(exception& e) {
2762                 m->errorOut(e, "ShhherCommand", "getOTUData");
2763                 exit(1);        
2764         }               
2765 }
2766 /**************************************************************************************************/
2767
2768 int ShhherCommand::calcCentroidsDriver(int numOTUs, 
2769                                           vector<int>& cumNumSeqs,
2770                                           vector<int>& nSeqsPerOTU,
2771                                           vector<int>& seqIndex,
2772                                           vector<short>& change,                //did the centroid sequence change? 0 = no; 1 = yes
2773                                           vector<int>& centroids,               //the representative flowgram for each cluster m
2774                                           vector<double>& singleTau,    //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2775                                           vector<int>& mapSeqToUnique,
2776                                           vector<short>& uniqueFlowgrams,
2777                                           vector<short>& flowDataIntI,
2778                                           vector<int>& lengths,
2779                                           int numFlowCells,
2780                                           vector<int>& seqNumber){                          
2781         
2782         //this function gets the most likely homopolymer length at a flow position for a group of sequences
2783         //within an otu
2784         
2785         try{
2786                 
2787                 for(int i=0;i<numOTUs;i++){
2788                         
2789                         if (m->control_pressed) { break; }
2790                         
2791                         double count = 0;
2792                         int position = 0;
2793                         int minFlowGram = 100000000;
2794                         double minFlowValue = 1e8;
2795                         change[i] = 0; //FALSE
2796                         
2797                         for(int j=0;j<nSeqsPerOTU[i];j++){
2798                                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
2799                         }
2800             
2801                         if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
2802                                 vector<double> adF(nSeqsPerOTU[i]);
2803                                 vector<int> anL(nSeqsPerOTU[i]);
2804                                 
2805                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2806                                         int index = cumNumSeqs[i] + j;
2807                                         int nI = seqIndex[index];
2808                                         int nIU = mapSeqToUnique[nI];
2809                                         
2810                                         int k;
2811                                         for(k=0;k<position;k++){
2812                                                 if(nIU == anL[k]){
2813                                                         break;
2814                                                 }
2815                                         }
2816                                         if(k == position){
2817                                                 anL[position] = nIU;
2818                                                 adF[position] = 0.0000;
2819                                                 position++;
2820                                         }                                               
2821                                 }
2822                                 
2823                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2824                                         int index = cumNumSeqs[i] + j;
2825                                         int nI = seqIndex[index];
2826                                         
2827                                         double tauValue = singleTau[seqNumber[index]];
2828                                         
2829                                         for(int k=0;k<position;k++){
2830                                                 double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
2831                                                 adF[k] += dist * tauValue;
2832                                         }
2833                                 }
2834                                 
2835                                 for(int j=0;j<position;j++){
2836                                         if(adF[j] < minFlowValue){
2837                                                 minFlowGram = j;
2838                                                 minFlowValue = adF[j];
2839                                         }
2840                                 }
2841                                 
2842                                 if(centroids[i] != anL[minFlowGram]){
2843                                         change[i] = 1;
2844                                         centroids[i] = anL[minFlowGram];
2845                                 }
2846                         }
2847                         else if(centroids[i] != -1){
2848                                 change[i] = 1;
2849                                 centroids[i] = -1;                      
2850                         }
2851                 }
2852         
2853         return 0;
2854         }
2855         catch(exception& e) {
2856                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
2857                 exit(1);        
2858         }               
2859 }
2860 /**************************************************************************************************/
2861
2862 double ShhherCommand::getDistToCentroid(int cent, int flow, int length, vector<short>& uniqueFlowgrams,
2863                                         vector<short>& flowDataIntI, int numFlowCells){
2864         try{
2865                 
2866                 int flowAValue = cent * numFlowCells;
2867                 int flowBValue = flow * numFlowCells;
2868                 
2869                 double dist = 0;
2870         
2871                 for(int i=0;i<length;i++){
2872                         dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
2873                         flowAValue++;
2874                         flowBValue++;
2875                 }
2876                 
2877                 return dist / (double)length;
2878         }
2879         catch(exception& e) {
2880                 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
2881                 exit(1);        
2882         }               
2883 }
2884 /**************************************************************************************************/
2885
2886 double ShhherCommand::getNewWeights(int numOTUs, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<double>& singleTau, vector<int>& seqNumber, vector<double>& weight){
2887         try{
2888                 
2889                 double maxChange = 0;
2890                 
2891                 for(int i=0;i<numOTUs;i++){
2892                         
2893                         if (m->control_pressed) { break; }
2894                         
2895                         double difference = weight[i];
2896                         weight[i] = 0;
2897                         
2898                         for(int j=0;j<nSeqsPerOTU[i];j++){
2899                                 int index = cumNumSeqs[i] + j;
2900                                 double tauValue = singleTau[seqNumber[index]];
2901                                 weight[i] += tauValue;
2902                         }
2903                         
2904                         difference = fabs(weight[i] - difference);
2905                         if(difference > maxChange){     maxChange = difference; }
2906                 }
2907                 return maxChange;
2908         }
2909         catch(exception& e) {
2910                 m->errorOut(e, "ShhherCommand", "getNewWeights");
2911                 exit(1);        
2912         }               
2913 }
2914
2915 /**************************************************************************************************/
2916
2917 double ShhherCommand::getLikelihood(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<int>& seqNumber, vector<int>& cumNumSeqs, vector<int>& seqIndex, vector<double>& dist, vector<double>& weight){
2918         
2919         try{
2920                 
2921                 vector<long double> P(numSeqs, 0);
2922                 int effNumOTUs = 0;
2923                 
2924                 for(int i=0;i<numOTUs;i++){
2925                         if(weight[i] > MIN_WEIGHT){
2926                                 effNumOTUs++;
2927                         }
2928                 }
2929                 
2930                 string hold;
2931                 for(int i=0;i<numOTUs;i++){
2932                         
2933                         if (m->control_pressed) { break; }
2934                         
2935                         for(int j=0;j<nSeqsPerOTU[i];j++){
2936                                 int index = cumNumSeqs[i] + j;
2937                                 int nI = seqIndex[index];
2938                                 double singleDist = dist[seqNumber[index]];
2939                                 
2940                                 P[nI] += weight[i] * exp(-singleDist * sigma);
2941                         }
2942                 }
2943                 double nLL = 0.00;
2944                 for(int i=0;i<numSeqs;i++){
2945                         if(P[i] == 0){  P[i] = DBL_EPSILON;     }
2946             
2947                         nLL += -log(P[i]);
2948                 }
2949                 
2950                 nLL = nLL -(double)numSeqs * log(sigma);
2951         
2952                 return nLL; 
2953         }
2954         catch(exception& e) {
2955                 m->errorOut(e, "ShhherCommand", "getNewWeights");
2956                 exit(1);        
2957         }               
2958 }
2959
2960 /**************************************************************************************************/
2961
2962 int ShhherCommand::checkCentroids(int numOTUs, vector<int>& centroids, vector<double>& weight){
2963         try{
2964                 vector<int> unique(numOTUs, 1);
2965                 
2966                 for(int i=0;i<numOTUs;i++){
2967                         if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
2968                                 unique[i] = -1;
2969                         }
2970                 }
2971                 
2972                 for(int i=0;i<numOTUs;i++){
2973                         
2974                         if (m->control_pressed) { break; }
2975                         
2976                         if(unique[i] == 1){
2977                                 for(int j=i+1;j<numOTUs;j++){
2978                                         if(unique[j] == 1){
2979                                                 
2980                                                 if(centroids[j] == centroids[i]){
2981                                                         unique[j] = 0;
2982                                                         centroids[j] = -1;
2983                                                         
2984                                                         weight[i] += weight[j];
2985                                                         weight[j] = 0.0;
2986                                                 }
2987                                         }
2988                                 }
2989                         }
2990                 }
2991         
2992         return 0;
2993         }
2994         catch(exception& e) {
2995                 m->errorOut(e, "ShhherCommand", "checkCentroids");
2996                 exit(1);        
2997         }               
2998 }
2999 /**************************************************************************************************/
3000
3001 void ShhherCommand::calcNewDistances(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<double>& dist, 
3002                                      vector<double>& weight, vector<short>& change, vector<int>& centroids,
3003                                      vector<vector<int> >& aaP, vector<double>& singleTau, vector<vector<int> >& aaI,   
3004                                      vector<int>& seqNumber, vector<int>& seqIndex,
3005                                      vector<short>& uniqueFlowgrams,
3006                                      vector<short>& flowDataIntI, int numFlowCells, vector<int>& lengths){
3007         
3008         try{
3009                 
3010                 int total = 0;
3011                 vector<double> newTau(numOTUs,0);
3012                 vector<double> norms(numSeqs, 0);
3013                 nSeqsPerOTU.assign(numOTUs, 0);
3014         
3015                 for(int i=0;i<numSeqs;i++){
3016                         
3017                         if (m->control_pressed) { break; }
3018                         
3019                         int indexOffset = i * numOTUs;
3020             
3021                         double offset = 1e8;
3022                         
3023                         for(int j=0;j<numOTUs;j++){
3024                 
3025                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
3026                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
3027                                 }
3028                 
3029                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
3030                                         offset = dist[indexOffset + j];
3031                                 }
3032                         }
3033             
3034                         for(int j=0;j<numOTUs;j++){
3035                                 if(weight[j] > MIN_WEIGHT){
3036                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
3037                                         norms[i] += newTau[j];
3038                                 }
3039                                 else{
3040                                         newTau[j] = 0.0;
3041                                 }
3042                         }
3043             
3044                         for(int j=0;j<numOTUs;j++){
3045                                 newTau[j] /= norms[i];
3046                         }
3047             
3048                         for(int j=0;j<numOTUs;j++){
3049                                 if(newTau[j] > MIN_TAU){
3050                                         
3051                                         int oldTotal = total;
3052                                         
3053                                         total++;
3054                                         
3055                                         singleTau.resize(total, 0);
3056                                         seqNumber.resize(total, 0);
3057                                         seqIndex.resize(total, 0);
3058                                         
3059                                         singleTau[oldTotal] = newTau[j];
3060                                         
3061                                         aaP[j][nSeqsPerOTU[j]] = oldTotal;
3062                                         aaI[j][nSeqsPerOTU[j]] = i;
3063                                         nSeqsPerOTU[j]++;
3064                                 }
3065                         }
3066             
3067                 }
3068         
3069         }
3070         catch(exception& e) {
3071                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
3072                 exit(1);        
3073         }               
3074 }
3075 /**************************************************************************************************/
3076
3077 int ShhherCommand::fill(int numOTUs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
3078         try {
3079                 int index = 0;
3080                 for(int i=0;i<numOTUs;i++){
3081                         
3082                         if (m->control_pressed) { return 0; }
3083                         
3084                         cumNumSeqs[i] = index;
3085                         for(int j=0;j<nSeqsPerOTU[i];j++){
3086                                 seqNumber[index] = aaP[i][j];
3087                                 seqIndex[index] = aaI[i][j];
3088                                 
3089                                 index++;
3090                         }
3091                 }
3092         
3093         return 0; 
3094         }
3095         catch(exception& e) {
3096                 m->errorOut(e, "ShhherCommand", "fill");
3097                 exit(1);        
3098         }               
3099 }
3100 /**************************************************************************************************/
3101
3102 void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU,
3103                             vector<int>& otuData, vector<double>& singleTau, vector<double>& dist, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
3104         
3105         try {
3106                 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
3107                 
3108                 for(int i=0;i<numOTUs;i++){
3109                         
3110                         if (m->control_pressed) { break; }
3111                         
3112                         for(int j=0;j<nSeqsPerOTU[i];j++){
3113                                 int index = cumNumSeqs[i] + j;
3114                                 double tauValue = singleTau[seqNumber[index]];
3115                                 int sIndex = seqIndex[index];
3116                                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
3117                         }
3118                 }
3119                 
3120                 for(int i=0;i<numSeqs;i++){
3121                         double maxTau = -1.0000;
3122                         int maxOTU = -1;
3123                         for(int j=0;j<numOTUs;j++){
3124                                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
3125                                         maxTau = bigTauMatrix[i * numOTUs + j];
3126                                         maxOTU = j;
3127                                 }
3128                         }
3129                         
3130                         otuData[i] = maxOTU;
3131                 }
3132                 
3133                 nSeqsPerOTU.assign(numOTUs, 0);         
3134                 
3135                 for(int i=0;i<numSeqs;i++){
3136                         int index = otuData[i];
3137                         
3138                         singleTau[i] = 1.0000;
3139                         dist[i] = 0.0000;
3140                         
3141                         aaP[index][nSeqsPerOTU[index]] = i;
3142                         aaI[index][nSeqsPerOTU[index]] = i;
3143                         
3144                         nSeqsPerOTU[index]++;
3145                 }
3146         
3147                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);  
3148         }
3149         catch(exception& e) {
3150                 m->errorOut(e, "ShhherCommand", "setOTUs");
3151                 exit(1);        
3152         }               
3153 }
3154 /**************************************************************************************************/
3155
3156 void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string qualityFileName, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
3157                                    vector<double>& singleTau, vector<short>& flowDataIntI, vector<short>& uniqueFlowgrams, vector<int>& cumNumSeqs,
3158                                    vector<int>& mapUniqueToSeq, vector<string>& seqNameVector, vector<int>& centroids, vector<vector<int> >& aaI){
3159         
3160         try {
3161         
3162                 ofstream qualityFile;
3163                 m->openOutputFile(qualityFileName, qualityFile);
3164         
3165                 qualityFile.setf(ios::fixed, ios::floatfield);
3166                 qualityFile.setf(ios::showpoint);
3167                 qualityFile << setprecision(6);
3168                 
3169                 vector<vector<int> > qualities(numOTUs);
3170                 vector<double> pr(HOMOPS, 0);
3171                 
3172                 
3173                 for(int i=0;i<numOTUs;i++){
3174                         
3175                         if (m->control_pressed) { break; }
3176                         
3177                         int index = 0;
3178                         int base = 0;
3179                         
3180                         if(nSeqsPerOTU[i] > 0){
3181                                 qualities[i].assign(1024, -1);
3182                                 
3183                                 while(index < numFlowCells){
3184                                         double maxPrValue = 1e8;
3185                                         short maxPrIndex = -1;
3186                                         double count = 0.0000;
3187                                         
3188                                         pr.assign(HOMOPS, 0);
3189                                         
3190                                         for(int j=0;j<nSeqsPerOTU[i];j++){
3191                                                 int lIndex = cumNumSeqs[i] + j;
3192                                                 double tauValue = singleTau[seqNumber[lIndex]];
3193                                                 int sequenceIndex = aaI[i][j];
3194                                                 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
3195                                                 
3196                                                 count += tauValue;
3197                                                 
3198                                                 for(int s=0;s<HOMOPS;s++){
3199                                                         pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
3200                                                 }
3201                                         }
3202                                         
3203                                         maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
3204                                         maxPrValue = pr[maxPrIndex];
3205                                         
3206                                         if(count > MIN_COUNT){
3207                                                 double U = 0.0000;
3208                                                 double norm = 0.0000;
3209                                                 
3210                                                 for(int s=0;s<HOMOPS;s++){
3211                                                         norm += exp(-(pr[s] - maxPrValue));
3212                                                 }
3213                                                 
3214                                                 for(int s=1;s<=maxPrIndex;s++){
3215                                                         int value = 0;
3216                                                         double temp = 0.0000;
3217                                                         
3218                                                         U += exp(-(pr[s-1]-maxPrValue))/norm;
3219                                                         
3220                                                         if(U>0.00){
3221                                                                 temp = log10(U);
3222                                                         }
3223                                                         else{
3224                                                                 temp = -10.1;
3225                                                         }
3226                                                         temp = floor(-10 * temp);
3227                                                         value = (int)floor(temp);
3228                                                         if(value > 100){        value = 100;    }
3229                                                         
3230                                                         qualities[i][base] = (int)value;
3231                                                         base++;
3232                                                 }
3233                                         }
3234                                         
3235                                         index++;
3236                                 }
3237                         }
3238                         
3239                         
3240                         if(otuCounts[i] > 0){
3241                                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
3242                         
3243                                 int j=4;        //need to get past the first four bases
3244                                 while(qualities[i][j] != -1){
3245                     qualityFile << qualities[i][j] << ' ';
3246                     if (j > qualities[i].size()) { break; }
3247                     j++;
3248                                 }
3249                                 qualityFile << endl;
3250                         }
3251                 }
3252                 qualityFile.close();
3253                 outputNames.push_back(qualityFileName);
3254         
3255         }
3256         catch(exception& e) {
3257                 m->errorOut(e, "ShhherCommand", "writeQualities");
3258                 exit(1);        
3259         }               
3260 }
3261
3262 /**************************************************************************************************/
3263
3264 void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string fastaFileName, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
3265         try {
3266                 
3267                 ofstream fastaFile;
3268                 m->openOutputFile(fastaFileName, fastaFile);
3269                 
3270                 vector<string> names(numOTUs, "");
3271                 
3272                 for(int i=0;i<numOTUs;i++){
3273                         
3274                         if (m->control_pressed) { break; }
3275                         
3276                         int index = centroids[i];
3277                         
3278                         if(otuCounts[i] > 0){
3279                                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
3280                                 
3281                                 string newSeq = "";
3282                                 
3283                                 for(int j=0;j<numFlowCells;j++){
3284                                         
3285                                         char base = flowOrder[j % 4];
3286                                         for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
3287                                                 newSeq += base;
3288                                         }
3289                                 }
3290                                 
3291                                 if (newSeq.length() >= 4) {  fastaFile << newSeq.substr(4) << endl;  }
3292                 else {  fastaFile << "NNNN" << endl;  }
3293                         }
3294                 }
3295                 fastaFile.close();
3296         
3297                 outputNames.push_back(fastaFileName);
3298         
3299                 if(thisCompositeFASTAFileName != ""){
3300                         m->appendFiles(fastaFileName, thisCompositeFASTAFileName);
3301                 }
3302         }
3303         catch(exception& e) {
3304                 m->errorOut(e, "ShhherCommand", "writeSequences");
3305                 exit(1);        
3306         }               
3307 }
3308
3309 /**************************************************************************************************/
3310
3311 void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string nameFileName, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
3312         try {
3313                 
3314                 ofstream nameFile;
3315                 m->openOutputFile(nameFileName, nameFile);
3316                 
3317                 for(int i=0;i<numOTUs;i++){
3318                         
3319                         if (m->control_pressed) { break; }
3320                         
3321                         if(otuCounts[i] > 0){
3322                                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
3323                                 
3324                                 for(int j=1;j<nSeqsPerOTU[i];j++){
3325                                         nameFile << ',' << seqNameVector[aaI[i][j]];
3326                                 }
3327                                 
3328                                 nameFile << endl;
3329                         }
3330                 }
3331                 nameFile.close();
3332                 outputNames.push_back(nameFileName);
3333                 
3334                 
3335                 if(thisCompositeNamesFileName != ""){
3336                         m->appendFiles(nameFileName, thisCompositeNamesFileName);
3337                 }               
3338         }
3339         catch(exception& e) {
3340                 m->errorOut(e, "ShhherCommand", "writeNames");
3341                 exit(1);        
3342         }               
3343 }
3344
3345 /**************************************************************************************************/
3346
3347 void ShhherCommand::writeGroups(string groupFileName, string fileRoot, int numSeqs, vector<string>& seqNameVector){
3348         try {
3349         ofstream groupFile;
3350                 m->openOutputFile(groupFileName, groupFile);
3351                 
3352                 for(int i=0;i<numSeqs;i++){
3353                         if (m->control_pressed) { break; }
3354                         groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
3355                 }
3356                 groupFile.close();
3357                 outputNames.push_back(groupFileName);
3358         
3359         }
3360         catch(exception& e) {
3361                 m->errorOut(e, "ShhherCommand", "writeGroups");
3362                 exit(1);        
3363         }               
3364 }
3365
3366 /**************************************************************************************************/
3367
3368 void ShhherCommand::writeClusters(string otuCountsFileName, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
3369         try {
3370                 ofstream otuCountsFile;
3371                 m->openOutputFile(otuCountsFileName, otuCountsFile);
3372                 
3373                 string bases = flowOrder;
3374                 
3375                 for(int i=0;i<numOTUs;i++){
3376                         
3377                         if (m->control_pressed) {
3378                                 break;
3379                         }
3380                         //output the translated version of the centroid sequence for the otu
3381                         if(otuCounts[i] > 0){
3382                                 int index = centroids[i];
3383                                 
3384                                 otuCountsFile << "ideal\t";
3385                                 for(int j=8;j<numFlowCells;j++){
3386                                         char base = bases[j % 4];
3387                                         for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
3388                                                 otuCountsFile << base;
3389                                         }
3390                                 }
3391                                 otuCountsFile << endl;
3392                                 
3393                                 for(int j=0;j<nSeqsPerOTU[i];j++){
3394                                         int sequence = aaI[i][j];
3395                                         otuCountsFile << seqNameVector[sequence] << '\t';
3396                                         
3397                                         string newSeq = "";
3398                                         
3399                                         for(int k=0;k<lengths[sequence];k++){
3400                                                 char base = bases[k % 4];
3401                                                 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
3402                         
3403                                                 for(int s=0;s<freq;s++){
3404                                                         newSeq += base;
3405                                                         //otuCountsFile << base;
3406                                                 }
3407                                         }
3408                                         
3409                     if (newSeq.length() >= 4) {  otuCountsFile << newSeq.substr(4) << endl;  }
3410                     else {  otuCountsFile << "NNNN" << endl;  }
3411                                 }
3412                                 otuCountsFile << endl;
3413                         }
3414                 }
3415                 otuCountsFile.close();
3416                 outputNames.push_back(otuCountsFileName);
3417         
3418         }
3419         catch(exception& e) {
3420                 m->errorOut(e, "ShhherCommand", "writeClusters");
3421                 exit(1);        
3422         }               
3423 }
3424
3425 /**************************************************************************************************/
3426
3427 void ShhherCommand::getSingleLookUp(){
3428         try{
3429                 //      these are the -log probabilities that a signal corresponds to a particular homopolymer length
3430                 singleLookUp.assign(HOMOPS * NUMBINS, 0);
3431                 
3432                 int index = 0;
3433                 ifstream lookUpFile;
3434                 m->openInputFile(lookupFileName, lookUpFile);
3435                 
3436                 for(int i=0;i<HOMOPS;i++){
3437                         
3438                         if (m->control_pressed) { break; }
3439                         
3440                         float logFracFreq;
3441                         lookUpFile >> logFracFreq;
3442                         
3443                         for(int j=0;j<NUMBINS;j++)      {
3444                                 lookUpFile >> singleLookUp[index];
3445                                 index++;
3446                         }
3447                 }       
3448                 lookUpFile.close();
3449         }
3450         catch(exception& e) {
3451                 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
3452                 exit(1);
3453         }
3454 }
3455
3456 /**************************************************************************************************/
3457
3458 void ShhherCommand::getJointLookUp(){
3459         try{
3460                 
3461                 //      the most likely joint probability (-log) that two intenities have the same polymer length
3462                 jointLookUp.resize(NUMBINS * NUMBINS, 0);
3463                 
3464                 for(int i=0;i<NUMBINS;i++){
3465                         
3466                         if (m->control_pressed) { break; }
3467                         
3468                         for(int j=0;j<NUMBINS;j++){             
3469                                 
3470                                 double minSum = 100000000;
3471                                 
3472                                 for(int k=0;k<HOMOPS;k++){
3473                                         double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
3474                                         
3475                                         if(sum < minSum)        {       minSum = sum;           }
3476                                 }       
3477                                 jointLookUp[i * NUMBINS + j] = minSum;
3478                         }
3479                 }
3480         }
3481         catch(exception& e) {
3482                 m->errorOut(e, "ShhherCommand", "getJointLookUp");
3483                 exit(1);
3484         }
3485 }
3486
3487 /**************************************************************************************************/
3488
3489 double ShhherCommand::getProbIntensity(int intIntensity){                          
3490         try{
3491
3492                 double minNegLogProb = 100000000; 
3493
3494                 
3495                 for(int i=0;i<HOMOPS;i++){//loop signal strength
3496                         
3497                         if (m->control_pressed) { break; }
3498                         
3499                         float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
3500                         if(negLogProb < minNegLogProb)  {       minNegLogProb = negLogProb; }
3501                 }
3502                 
3503                 return minNegLogProb;
3504         }
3505         catch(exception& e) {
3506                 m->errorOut(e, "ShhherCommand", "getProbIntensity");
3507                 exit(1);
3508         }
3509 }
3510
3511
3512
3513