]> git.donarmstrong.com Git - mothur.git/blob - shhhercommand.cpp
added abundance and sample parameters to get.coremicrobiome command.
[mothur.git] / shhhercommand.cpp
1 /*
2  *  shhher.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/27/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "shhhercommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> ShhherCommand::setParameters(){  
14         try {
15                 CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pflow);
16                 CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pfile);
17                 CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(plookup);
18                 CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(pcutoff);
19                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
20                 CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "",false,false); parameters.push_back(pmaxiter);
21                 CommandParameter psigma("sigma", "Number", "", "60", "", "", "",false,false); parameters.push_back(psigma);
22                 CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "",false,false); parameters.push_back(pmindelta);
23                 CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder);
24                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
25                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
26                 
27                 vector<string> myArray;
28                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
29                 return myArray;
30         }
31         catch(exception& e) {
32                 m->errorOut(e, "ShhherCommand", "setParameters");
33                 exit(1);
34         }
35 }
36 //**********************************************************************************************************************
37 string ShhherCommand::getHelpString(){  
38         try {
39                 string helpString = "";
40                 helpString += "The shhh.flows command reads a file containing flowgrams and creates a file of corrected sequences.\n";
41                 return helpString;
42         }
43         catch(exception& e) {
44                 m->errorOut(e, "ShhherCommand", "getHelpString");
45                 exit(1);
46         }
47 }
48 //**********************************************************************************************************************
49
50 ShhherCommand::ShhherCommand(){ 
51         try {
52                 abort = true; calledHelp = true;
53                 setParameters();
54                 
55                 //initialize outputTypes
56 //              vector<string> tempOutNames;
57 //              outputTypes["pn.dist"] = tempOutNames;
58
59         }
60         catch(exception& e) {
61                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
62                 exit(1);
63         }
64 }
65
66 //**********************************************************************************************************************
67
68 ShhherCommand::ShhherCommand(string option) {
69         try {
70         
71 #ifdef USE_MPI
72                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
73                 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
74         
75                 if(pid == 0){
76 #endif
77                 abort = false; calledHelp = false;   
78                 
79                 //allow user to run help
80                 if(option == "help") { help(); abort = true; calledHelp = true; }
81                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
82                 
83                 else {
84                         vector<string> myArray = setParameters();
85                         
86                         OptionParser parser(option);
87                         map<string,string> parameters = parser.getParameters();
88                         
89                         ValidParameters validParameter;
90                         map<string,string>::iterator it;
91                         
92                         //check to make sure all parameters are valid for command
93                         for (it = parameters.begin(); it != parameters.end(); it++) { 
94                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
95                         }
96                         
97                         //initialize outputTypes
98                         vector<string> tempOutNames;
99 //                      outputTypes["pn.dist"] = tempOutNames;
100                         //                      outputTypes["fasta"] = tempOutNames;
101                         
102                         //if the user changes the input directory command factory will send this info to us in the output parameter 
103                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
104                         if (inputDir == "not found"){   inputDir = "";          }
105                         else {
106                                 string path;
107                                 it = parameters.find("flow");
108                                 //user has given a template file
109                                 if(it != parameters.end()){ 
110                                         path = m->hasPath(it->second);
111                                         //if the user has not given a path then, add inputdir. else leave path alone.
112                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
113                                 }
114                                 
115                                 it = parameters.find("lookup");
116                                 //user has given a template file
117                                 if(it != parameters.end()){ 
118                                         path = m->hasPath(it->second);
119                                         //if the user has not given a path then, add inputdir. else leave path alone.
120                                         if (path == "") {       parameters["lookup"] = inputDir + it->second;           }
121                                 }
122
123                                 it = parameters.find("file");
124                                 //user has given a template file
125                                 if(it != parameters.end()){ 
126                                         path = m->hasPath(it->second);
127                                         //if the user has not given a path then, add inputdir. else leave path alone.
128                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
129                                 }
130                         }
131             
132             //if the user changes the output directory command factory will send this info to us in the output parameter 
133                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
134             
135                         //check for required parameters
136                         flowFileName = validParameter.validFile(parameters, "flow", true);
137                         flowFilesFileName = validParameter.validFile(parameters, "file", true);
138                         if (flowFileName == "not found" && flowFilesFileName == "not found") {
139                                 m->mothurOut("values for either flow or file must be provided for the shhh.seqs command.");
140                                 m->mothurOutEndLine();
141                                 abort = true; 
142                         }
143                         else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
144                         
145                         if(flowFileName != "not found"){
146                                 compositeFASTAFileName = "";    
147                                 compositeNamesFileName = "";    
148                         }
149                         else{
150                                 ofstream temp;
151                 
152                 string thisoutputDir = m->hasPath(flowFilesFileName); //if user entered a file with a path then preserve it
153                 
154                                 //flow.files = 9 character offset
155                                 compositeFASTAFileName = thisoutputDir + m->getRootName(m->getSimpleName(flowFilesFileName)) + "shhh.fasta";
156                                 m->openOutputFile(compositeFASTAFileName, temp);
157                                 temp.close();
158                                 
159                                 compositeNamesFileName = thisoutputDir + m->getRootName(m->getSimpleName(flowFilesFileName)) + "shhh.names";
160                                 m->openOutputFile(compositeNamesFileName, temp);
161                                 temp.close();
162                         }
163             
164             if(flowFilesFileName != "not found"){
165                 string fName;
166                 
167                 ifstream flowFilesFile;
168                 m->openInputFile(flowFilesFileName, flowFilesFile);
169                 while(flowFilesFile){
170                     fName = m->getline(flowFilesFile);
171                     
172                     //test if file is valid
173                     ifstream in;
174                     int ableToOpen = m->openInputFile(fName, in, "noerror");
175                     in.close(); 
176                     if (ableToOpen == 1) {
177                         if (inputDir != "") { //default path is set
178                             string tryPath = inputDir + fName;
179                             m->mothurOut("Unable to open " + fName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
180                             ifstream in2;
181                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
182                             in2.close();
183                             fName = tryPath;
184                         }
185                     }
186                     
187                     if (ableToOpen == 1) {
188                         if (m->getDefaultPath() != "") { //default path is set
189                             string tryPath = m->getDefaultPath() + m->getSimpleName(fName);
190                             m->mothurOut("Unable to open " + fName + ". Trying default " + tryPath); m->mothurOutEndLine();
191                             ifstream in2;
192                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
193                             in2.close();
194                             fName = tryPath;
195                         }
196                     }
197                     
198                     //if you can't open it its not in current working directory or inputDir, try mothur excutable location
199                     if (ableToOpen == 1) {
200                         string exepath = m->argv;
201                         string tempPath = exepath;
202                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
203                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
204                         
205                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(fName);
206                         m->mothurOut("Unable to open " + fName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
207                         ifstream in2;
208                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
209                         in2.close();
210                         fName = tryPath;
211                     }
212                     
213                     if (ableToOpen == 1) {  m->mothurOut("Unable to open " + fName + ". Disregarding. "); m->mothurOutEndLine();  }
214                     else { flowFileVector.push_back(fName); }
215                     m->gobble(flowFilesFile);
216                 }
217                 flowFilesFile.close();
218                 if (flowFileVector.size() == 0) {  m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
219             }
220             else{
221                 outputDir += m->hasPath(flowFileName);
222                 flowFileVector.push_back(flowFileName);
223             }
224                 
225                         //check for optional parameter and set defaults
226                         // ...at some point should added some additional type checking...
227                         string temp;
228                         temp = validParameter.validFile(parameters, "lookup", true);
229                         if (temp == "not found")        {       
230                                 lookupFileName = "LookUp_Titanium.pat"; 
231                                 
232                                 int ableToOpen;
233                                 ifstream in;
234                                 ableToOpen = m->openInputFile(lookupFileName, in, "noerror");
235                                 in.close();     
236                                 
237                                 //if you can't open it, try input location
238                                 if (ableToOpen == 1) {
239                                         if (inputDir != "") { //default path is set
240                                                 string tryPath = inputDir + lookupFileName;
241                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
242                                                 ifstream in2;
243                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
244                                                 in2.close();
245                                                 lookupFileName = tryPath;
246                                         }
247                                 }
248                                 
249                                 //if you can't open it, try default location
250                                 if (ableToOpen == 1) {
251                                         if (m->getDefaultPath() != "") { //default path is set
252                                                 string tryPath = m->getDefaultPath() + m->getSimpleName(lookupFileName);
253                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying default " + tryPath); m->mothurOutEndLine();
254                                                 ifstream in2;
255                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
256                                                 in2.close();
257                                                 lookupFileName = tryPath;
258                                         }
259                                 }
260                                 
261                                 //if you can't open it its not in current working directory or inputDir, try mothur excutable location
262                                 if (ableToOpen == 1) {
263                                         string exepath = m->argv;
264                                         string tempPath = exepath;
265                                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
266                                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
267                                         
268                                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
269                                         m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
270                                         ifstream in2;
271                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
272                                         in2.close();
273                                         lookupFileName = tryPath;
274                                 }
275                                 
276                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
277                         }
278                         else if(temp == "not open")     {       
279                                 
280                                 lookupFileName = validParameter.validFile(parameters, "lookup", false);
281                                 
282                                 //if you can't open it its not inputDir, try mothur excutable location
283                                 string exepath = m->argv;
284                                 string tempPath = exepath;
285                                 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
286                                 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
287                                         
288                                 string tryPath = m->getFullPathName(exepath) + lookupFileName;
289                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
290                                 ifstream in2;
291                                 int ableToOpen = m->openInputFile(tryPath, in2, "noerror");
292                                 in2.close();
293                                 lookupFileName = tryPath;
294                                 
295                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
296                         }else                                           {       lookupFileName = temp;  }
297                         
298                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
299                         m->setProcessors(temp);
300                         m->mothurConvert(temp, processors);
301
302                         temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.01";          }
303                         m->mothurConvert(temp, cutoff); 
304                         
305                         temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){       temp = "0.000001";      }
306                         m->mothurConvert(temp, minDelta); 
307
308                         temp = validParameter.validFile(parameters, "maxiter", false);  if (temp == "not found"){       temp = "1000";          }
309                         m->mothurConvert(temp, maxIters); 
310
311                         temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
312                         m->mothurConvert(temp, sigma); 
313                         
314                         flowOrder = validParameter.validFile(parameters, "order", false);
315                         if (flowOrder == "not found"){ flowOrder = "TACG";              }
316                         else if(flowOrder.length() != 4){
317                                 m->mothurOut("The value of the order option must be four bases long\n");
318                         }
319                         
320                 }
321 #ifdef USE_MPI
322                 }                               
323 #endif
324         }
325         catch(exception& e) {
326                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
327                 exit(1);
328         }
329 }
330 //**********************************************************************************************************************
331 #ifdef USE_MPI
332 int ShhherCommand::execute(){
333         try {
334                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
335                 
336                 int tag = 1976;
337                 MPI_Status status; 
338                         
339                 if(pid == 0){
340
341                         for(int i=1;i<ncpus;i++){
342                                 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
343                         }
344                         if(abort == 1){ return 0;       }
345
346                         processors = ncpus;
347                         
348                         m->mothurOut("\nGetting preliminary data...\n");
349                         getSingleLookUp();      if (m->control_pressed) { return 0; }
350                         getJointLookUp();       if (m->control_pressed) { return 0; }
351                         
352             vector<string> flowFileVector;
353                         if(flowFilesFileName != "not found"){
354                                 string fName;
355                 
356                                 ifstream flowFilesFile;
357                                 m->openInputFile(flowFilesFileName, flowFilesFile);
358                                 while(flowFilesFile){
359                                         fName = m->getline(flowFilesFile);
360                                         flowFileVector.push_back(fName);
361                                         m->gobble(flowFilesFile);
362                                 }
363                         }
364                         else{
365                                 flowFileVector.push_back(flowFileName);
366                         }
367             
368                         int numFiles = flowFileVector.size();
369
370                         for(int i=1;i<ncpus;i++){
371                                 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
372                         }
373                         
374                         for(int i=0;i<numFiles;i++){
375                                 
376                                 if (m->control_pressed) { break; }
377                                 
378                                 double begClock = clock();
379                                 unsigned long long begTime = time(NULL);
380
381                                 flowFileName = flowFileVector[i];
382                                 
383                                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
384                                 m->mothurOut("Reading flowgrams...\n");
385                                 getFlowData();
386                                 
387                                 if (m->control_pressed) { break; }
388
389                                 m->mothurOut("Identifying unique flowgrams...\n");
390                                 getUniques();
391                                 
392                                 if (m->control_pressed) { break; }
393
394                                 m->mothurOut("Calculating distances between flowgrams...\n");
395                                 char fileName[1024];
396                                 strcpy(fileName, flowFileName.c_str());
397
398                                 for(int i=1;i<ncpus;i++){
399                                         MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
400
401                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
402                                         MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
403                                         MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
404                                         MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
405                                         MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
406                                         MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
407                                         MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
408                                         MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
409                                         MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
410                                         MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
411                                 }                               
412                                                         
413                                 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
414                                 
415                                 if (m->control_pressed) { break; }
416                                 
417                                 int done;
418                                 for(int i=1;i<ncpus;i++){
419                                         MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
420                                         
421                                         m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
422                                         m->mothurRemove((distFileName + ".temp." + toString(i)));
423                                 }
424
425                                 string namesFileName = createNamesFile();
426                                 
427                                 if (m->control_pressed) { break; }
428                                 
429                                 m->mothurOut("\nClustering flowgrams...\n");
430                                 string listFileName = cluster(distFileName, namesFileName);
431                                 
432                                 if (m->control_pressed) { break; }
433                                 
434                                 getOTUData(listFileName);
435
436                                 m->mothurRemove(distFileName);
437                                 m->mothurRemove(namesFileName);
438                                 m->mothurRemove(listFileName);
439                                 
440                                 if (m->control_pressed) { break; }
441                                 
442                                 initPyroCluster();
443
444                                 if (m->control_pressed) { break; }
445                                 
446                                 for(int i=1;i<ncpus;i++){
447                                         MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
448                                         MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
449                                         MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
450                                         MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
451                                 }
452                                 
453                                 if (m->control_pressed) { break; }
454                                 
455                                 double maxDelta = 0;
456                                 int iter = 0;
457                                 
458                                 int numOTUsOnCPU = numOTUs / ncpus;
459                                 int numSeqsOnCPU = numSeqs / ncpus;
460                                 m->mothurOut("\nDenoising flowgrams...\n");
461                                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
462                                 
463                                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
464                                         
465                                         double cycClock = clock();
466                                         unsigned long long cycTime = time(NULL);
467                                         fill();
468                                         
469                                         if (m->control_pressed) { break; }
470
471                                         int total = singleTau.size();
472                                         for(int i=1;i<ncpus;i++){
473                                                 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
474                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
475                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
476                                                 
477                                                 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
478                                                 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
479                                                 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
480                                                 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
481                                                 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
482                                         }
483                                         
484                                         calcCentroidsDriver(0, numOTUsOnCPU);
485                                         
486                                         for(int i=1;i<ncpus;i++){
487                                                 int otuStart = i * numOTUs / ncpus;
488                                                 int otuStop = (i + 1) * numOTUs / ncpus;
489                                                 
490                                                 vector<int> tempCentroids(numOTUs, 0);
491                                                 vector<short> tempChange(numOTUs, 0);
492                                                 
493                                                 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
494                                                 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
495                                                 
496                                                 for(int j=otuStart;j<otuStop;j++){
497                                                         centroids[j] = tempCentroids[j];
498                                                         change[j] = tempChange[j];
499                                                 }
500                                         }
501                                                                         
502                                         maxDelta = getNewWeights(); if (m->control_pressed) { break; }
503                                         double nLL = getLikelihood(); if (m->control_pressed) { break; }
504                                         checkCentroids(); if (m->control_pressed) { break; }
505                                         
506                                         for(int i=1;i<ncpus;i++){
507                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
508                                                 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
509                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
510                                         }
511                                         
512                                         calcNewDistancesParent(0, numSeqsOnCPU);
513
514                                         total = singleTau.size();
515
516                                         for(int i=1;i<ncpus;i++){
517                                                 int childTotal;
518                                                 int seqStart = i * numSeqs / ncpus;
519                                                 int seqStop = (i + 1) * numSeqs / ncpus;
520                                                 
521                                                 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
522
523                                                 vector<int> childSeqIndex(childTotal, 0);
524                                                 vector<double> childSingleTau(childTotal, 0);
525                                                 vector<double> childDist(numSeqs * numOTUs, 0);
526                                                 vector<int> otuIndex(childTotal, 0);
527                                                 
528                                                 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
529                                                 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
530                                                 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
531                                                 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
532
533                                                 int oldTotal = total;
534                                                 total += childTotal;
535                                                 singleTau.resize(total, 0);
536                                                 seqIndex.resize(total, 0);
537                                                 seqNumber.resize(total, 0);
538                                                 
539                                                 int childIndex = 0;
540                                                 
541                                                 for(int j=oldTotal;j<total;j++){
542                                                         int otuI = otuIndex[childIndex];
543                                                         int seqI = childSeqIndex[childIndex];
544                                                         
545                                                         singleTau[j] = childSingleTau[childIndex];
546                                                         
547                                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
548                                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
549                                                         nSeqsPerOTU[otuI]++;
550                                                         childIndex++;
551                                                 }
552                                                 
553                                                 int index = seqStart * numOTUs;
554                                                 for(int j=seqStart;j<seqStop;j++){
555                                                         for(int k=0;k<numOTUs;k++){
556                                                                 dist[index] = childDist[index];
557                                                                 index++;
558                                                         }
559                                                 }                                       
560                                         }
561                                         
562                                         iter++;
563                                         
564                                         m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');                  
565
566                                         if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
567                                                 int live = 1;
568                                                 for(int i=1;i<ncpus;i++){
569                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
570                                                 }
571                                         }
572                                         else{
573                                                 int live = 0;
574                                                 for(int i=1;i<ncpus;i++){
575                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
576                                                 }
577                                         }
578                                         
579                                 }       
580                                 
581                                 if (m->control_pressed) { break; }
582                                 
583                                 m->mothurOut("\nFinalizing...\n");
584                                 fill();
585                                 
586                                 if (m->control_pressed) { break; }
587                                 
588                                 setOTUs();
589                                 
590                                 vector<int> otuCounts(numOTUs, 0);
591                                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
592                                 calcCentroidsDriver(0, numOTUs);
593                                 
594                                 if (m->control_pressed) { break; }
595                                 
596                                 writeQualities(otuCounts);      if (m->control_pressed) { break; }
597                                 writeSequences(otuCounts);      if (m->control_pressed) { break; }
598                                 writeNames(otuCounts);          if (m->control_pressed) { break; }
599                                 writeClusters(otuCounts);       if (m->control_pressed) { break; }
600                                 writeGroups();                          if (m->control_pressed) { break; }
601                                 
602                                                                  
603                                 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');                 
604                         }
605                 }
606                 else{
607                         int abort = 1;
608
609                         MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
610                         if(abort){      return 0;       }
611
612                         int numFiles;
613                         MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
614
615                         for(int i=0;i<numFiles;i++){
616                                 
617                                 if (m->control_pressed) { break; }
618                                 
619                                 //Now into the pyrodist part
620                                 bool live = 1;
621
622                                 char fileName[1024];
623                                 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
624                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
625                                 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
626                                 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
627                                 
628                                 flowDataIntI.resize(numSeqs * numFlowCells);
629                                 flowDataPrI.resize(numSeqs * numFlowCells);
630                                 mapUniqueToSeq.resize(numSeqs);
631                                 mapSeqToUnique.resize(numSeqs);
632                                 lengths.resize(numSeqs);
633                                 jointLookUp.resize(NUMBINS * NUMBINS);
634
635                                 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
636                                 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
637                                 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
638                                 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
639                                 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
640                                 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
641                                 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
642
643                                 flowFileName = string(fileName);
644                                 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
645                                 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
646                                 
647                                 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
648                                 
649                                 if (m->control_pressed) { break; }
650                                 
651                                 int done = 1;
652                                 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
653
654                                 //Now into the pyronoise part
655                                 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
656                                 
657                                 singleLookUp.resize(HOMOPS * NUMBINS);
658                                 uniqueFlowgrams.resize(numUniques * numFlowCells);
659                                 weight.resize(numOTUs);
660                                 centroids.resize(numOTUs);
661                                 change.resize(numOTUs);
662                                 dist.assign(numOTUs * numSeqs, 0);
663                                 nSeqsPerOTU.resize(numOTUs);
664                                 cumNumSeqs.resize(numOTUs);
665
666                                 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
667                                 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
668                                 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
669                                 
670                                 int startOTU = pid * numOTUs / ncpus;
671                                 int endOTU = (pid + 1) * numOTUs / ncpus;
672
673                                 int startSeq = pid * numSeqs / ncpus;
674                                 int endSeq = (pid + 1) * numSeqs /ncpus;
675                                 
676                                 int total;
677
678                                 while(live){
679                                         
680                                         if (m->control_pressed) { break; }
681                                         
682                                         MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
683                                         singleTau.assign(total, 0.0000);
684                                         seqNumber.assign(total, 0);
685                                         seqIndex.assign(total, 0);
686
687                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
688                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
689                                         MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
690                                         MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
691                                         MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
692                                         MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
693                                         MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
694
695                                         calcCentroidsDriver(startOTU, endOTU);
696
697                                         MPI_Send(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
698                                         MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
699
700                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
701                                         MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
702                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
703
704                                         vector<int> otuIndex(total, 0);
705                                         calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
706                                         total = otuIndex.size();
707                                         
708                                         MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
709                                         MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
710                                         MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
711                                         MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
712                                         MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
713                                 
714                                         MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
715                                 }
716                         }
717                 }
718                 
719                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
720                 
721                 MPI_Barrier(MPI_COMM_WORLD);
722
723                 
724                 if(compositeFASTAFileName != ""){
725                         outputNames.push_back(compositeFASTAFileName);
726                         outputNames.push_back(compositeNamesFileName);
727                 }
728
729                 m->mothurOutEndLine();
730                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
731                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
732                 m->mothurOutEndLine();
733                 
734                 return 0;
735
736         }
737         catch(exception& e) {
738                 m->errorOut(e, "ShhherCommand", "execute");
739                 exit(1);
740         }
741 }
742 /**************************************************************************************************/
743 string ShhherCommand::createNamesFile(){
744         try{
745                 
746                 vector<string> duplicateNames(numUniques, "");
747                 for(int i=0;i<numSeqs;i++){
748                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
749                 }
750                 
751                 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
752                 
753                 ofstream nameFile;
754                 m->openOutputFile(nameFileName, nameFile);
755                 
756                 for(int i=0;i<numUniques;i++){
757                         
758                         if (m->control_pressed) { break; }
759                         
760             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
761                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
762                 }
763                 
764                 nameFile.close();
765                 return  nameFileName;
766         }
767         catch(exception& e) {
768                 m->errorOut(e, "ShhherCommand", "createNamesFile");
769                 exit(1);
770         }
771 }
772 /**************************************************************************************************/
773
774 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
775         try{            
776                 ostringstream outStream;
777                 outStream.setf(ios::fixed, ios::floatfield);
778                 outStream.setf(ios::dec, ios::basefield);
779                 outStream.setf(ios::showpoint);
780                 outStream.precision(6);
781                 
782                 int begTime = time(NULL);
783                 double begClock = clock();
784                 
785                 for(int i=startSeq;i<stopSeq;i++){
786                         
787                         if (m->control_pressed) { break; }
788                         
789                         for(int j=0;j<i;j++){
790                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
791                                 
792                                 if(flowDistance < 1e-6){
793                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
794                                 }
795                                 else if(flowDistance <= cutoff){
796                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
797                                 }
798                         }
799                         if(i % 100 == 0){
800                                 m->mothurOut(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
801                         }
802                 }
803                 
804                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
805                 if(pid != 0){   fDistFileName += ".temp." + toString(pid);      }
806                 
807                 if (m->control_pressed) { return fDistFileName; }
808                 
809                 m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
810
811                 ofstream distFile(fDistFileName.c_str());
812                 distFile << outStream.str();            
813                 distFile.close();
814                 
815                 return fDistFileName;
816         }
817         catch(exception& e) {
818                 m->errorOut(e, "ShhherCommand", "flowDistMPI");
819                 exit(1);
820         }
821 }
822 /**************************************************************************************************/
823  
824 void ShhherCommand::getOTUData(string listFileName){
825     try {
826         
827         ifstream listFile;
828         m->openInputFile(listFileName, listFile);
829         string label;
830         
831         listFile >> label >> numOTUs;
832         
833         otuData.assign(numSeqs, 0);
834         cumNumSeqs.assign(numOTUs, 0);
835         nSeqsPerOTU.assign(numOTUs, 0);
836         aaP.clear();aaP.resize(numOTUs);
837         
838         seqNumber.clear();
839         aaI.clear();
840         seqIndex.clear();
841         
842         string singleOTU = "";
843         
844         for(int i=0;i<numOTUs;i++){
845             
846             if (m->control_pressed) { break; }
847             
848             listFile >> singleOTU;
849             
850             istringstream otuString(singleOTU);
851             
852             while(otuString){
853                 
854                 string seqName = "";
855                 
856                 for(int j=0;j<singleOTU.length();j++){
857                     char letter = otuString.get();
858                     
859                     if(letter != ','){
860                         seqName += letter;
861                     }
862                     else{
863                         map<string,int>::iterator nmIt = nameMap.find(seqName);
864                         int index = nmIt->second;
865                         
866                         nameMap.erase(nmIt);
867                         
868                         otuData[index] = i;
869                         nSeqsPerOTU[i]++;
870                         aaP[i].push_back(index);
871                         seqName = "";
872                     }
873                 }
874                 
875                 map<string,int>::iterator nmIt = nameMap.find(seqName);
876                 
877                 int index = nmIt->second;
878                 nameMap.erase(nmIt);
879                 
880                 otuData[index] = i;
881                 nSeqsPerOTU[i]++;
882                 aaP[i].push_back(index);        
883                 
884                 otuString.get();
885             }
886             
887             sort(aaP[i].begin(), aaP[i].end());
888             for(int j=0;j<nSeqsPerOTU[i];j++){
889                 seqNumber.push_back(aaP[i][j]);
890             }
891             for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
892                 aaP[i].push_back(0);
893             }
894             
895             
896         }
897         
898         for(int i=1;i<numOTUs;i++){
899             cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
900         }
901         aaI = aaP;
902         seqIndex = seqNumber;
903         
904         listFile.close();       
905         
906     }
907     catch(exception& e) {
908         m->errorOut(e, "ShhherCommand", "getOTUData");
909         exit(1);        
910     }           
911 }
912
913 /**************************************************************************************************/
914
915 void ShhherCommand::initPyroCluster(){                          
916     try{
917         if (numOTUs < processors) { processors = 1; }
918         
919         dist.assign(numSeqs * numOTUs, 0);
920         change.assign(numOTUs, 1);
921         centroids.assign(numOTUs, -1);
922         weight.assign(numOTUs, 0);
923         singleTau.assign(numSeqs, 1.0);
924         
925         nSeqsBreaks.assign(processors+1, 0);
926         nOTUsBreaks.assign(processors+1, 0);
927         
928         nSeqsBreaks[0] = 0;
929         for(int i=0;i<processors;i++){
930             nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
931             nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
932         }
933         nSeqsBreaks[processors] = numSeqs;
934         nOTUsBreaks[processors] = numOTUs;
935     }
936     catch(exception& e) {
937         m->errorOut(e, "ShhherCommand", "initPyroCluster");
938         exit(1);        
939     }
940 }
941
942 /**************************************************************************************************/
943
944 void ShhherCommand::fill(){
945     try {
946         int index = 0;
947         for(int i=0;i<numOTUs;i++){
948             
949             if (m->control_pressed) { break; }
950             
951             cumNumSeqs[i] = index;
952             for(int j=0;j<nSeqsPerOTU[i];j++){
953                 seqNumber[index] = aaP[i][j];
954                 seqIndex[index] = aaI[i][j];
955                 
956                 index++;
957             }
958         }
959     }
960     catch(exception& e) {
961         m->errorOut(e, "ShhherCommand", "fill");
962         exit(1);        
963     }           
964 }
965
966 /**************************************************************************************************/
967  
968 void ShhherCommand::getFlowData(){
969     try{
970         ifstream flowFile;
971         m->openInputFile(flowFileName, flowFile);
972         
973         string seqName;
974         seqNameVector.clear();
975         lengths.clear();
976         flowDataIntI.clear();
977         nameMap.clear();
978         
979         
980         int currentNumFlowCells;
981         
982         float intensity;
983         
984         flowFile >> numFlowCells;
985         int index = 0;//pcluster
986         while(!flowFile.eof()){
987             
988             if (m->control_pressed) { break; }
989             
990             flowFile >> seqName >> currentNumFlowCells;
991             lengths.push_back(currentNumFlowCells);
992             
993             seqNameVector.push_back(seqName);
994             nameMap[seqName] = index++;//pcluster
995             
996             for(int i=0;i<numFlowCells;i++){
997                 flowFile >> intensity;
998                 if(intensity > 9.99)    {       intensity = 9.99;       }
999                 int intI = int(100 * intensity + 0.0001);
1000                 flowDataIntI.push_back(intI);
1001             }
1002             m->gobble(flowFile);
1003         }
1004         flowFile.close();
1005         
1006         numSeqs = seqNameVector.size();         
1007         
1008         for(int i=0;i<numSeqs;i++){
1009             
1010             if (m->control_pressed) { break; }
1011             
1012             int iNumFlowCells = i * numFlowCells;
1013             for(int j=lengths[i];j<numFlowCells;j++){
1014                 flowDataIntI[iNumFlowCells + j] = 0;
1015             }
1016         }
1017         
1018     }
1019     catch(exception& e) {
1020         m->errorOut(e, "ShhherCommand", "getFlowData");
1021         exit(1);
1022     }
1023 }
1024 /**************************************************************************************************/
1025 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1026         
1027         try{
1028                 vector<double> newTau(numOTUs,0);
1029                 vector<double> norms(numSeqs, 0);
1030                 otuIndex.clear();
1031                 seqIndex.clear();
1032                 singleTau.clear();
1033                 
1034                 for(int i=startSeq;i<stopSeq;i++){
1035                         
1036                         if (m->control_pressed) { break; }
1037                         
1038                         double offset = 1e8;
1039                         int indexOffset = i * numOTUs;
1040                         
1041                         for(int j=0;j<numOTUs;j++){
1042                                 
1043                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1044                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1045                                 }
1046                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1047                                         offset = dist[indexOffset + j];
1048                                 }
1049                         }
1050                         
1051                         for(int j=0;j<numOTUs;j++){
1052                                 if(weight[j] > MIN_WEIGHT){
1053                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1054                                         norms[i] += newTau[j];
1055                                 }
1056                                 else{
1057                                         newTau[j] = 0.0;
1058                                 }
1059                         }
1060                         
1061                         for(int j=0;j<numOTUs;j++){
1062                 
1063                                 newTau[j] /= norms[i];
1064                                 
1065                                 if(newTau[j] > MIN_TAU){
1066                                         otuIndex.push_back(j);
1067                                         seqIndex.push_back(i);
1068                                         singleTau.push_back(newTau[j]);
1069                                 }
1070                         }
1071                         
1072                 }
1073         }
1074         catch(exception& e) {
1075                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1076                 exit(1);        
1077         }               
1078 }
1079
1080 /**************************************************************************************************/
1081
1082 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1083     
1084     try{
1085         
1086         int total = 0;
1087         vector<double> newTau(numOTUs,0);
1088         vector<double> norms(numSeqs, 0);
1089         nSeqsPerOTU.assign(numOTUs, 0);
1090         
1091         for(int i=startSeq;i<stopSeq;i++){
1092             
1093             if (m->control_pressed) { break; }
1094             
1095             int indexOffset = i * numOTUs;
1096             
1097             double offset = 1e8;
1098             
1099             for(int j=0;j<numOTUs;j++){
1100                 
1101                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1102                     dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1103                 }
1104                 
1105                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1106                     offset = dist[indexOffset + j];
1107                 }
1108             }
1109             
1110             for(int j=0;j<numOTUs;j++){
1111                 if(weight[j] > MIN_WEIGHT){
1112                     newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1113                     norms[i] += newTau[j];
1114                 }
1115                 else{
1116                     newTau[j] = 0.0;
1117                 }
1118             }
1119             
1120             for(int j=0;j<numOTUs;j++){
1121                 newTau[j] /= norms[i];
1122             }
1123             
1124             for(int j=0;j<numOTUs;j++){
1125                 if(newTau[j] > MIN_TAU){
1126                     
1127                     int oldTotal = total;
1128                     
1129                     total++;
1130                     
1131                     singleTau.resize(total, 0);
1132                     seqNumber.resize(total, 0);
1133                     seqIndex.resize(total, 0);
1134                     
1135                     singleTau[oldTotal] = newTau[j];
1136                     
1137                     aaP[j][nSeqsPerOTU[j]] = oldTotal;
1138                     aaI[j][nSeqsPerOTU[j]] = i;
1139                     nSeqsPerOTU[j]++;
1140                 }
1141             }
1142             
1143         }
1144         
1145     }
1146     catch(exception& e) {
1147         m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1148         exit(1);        
1149     }           
1150 }
1151
1152 /**************************************************************************************************/
1153
1154 void ShhherCommand::setOTUs(){
1155     
1156     try {
1157         vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1158         
1159         for(int i=0;i<numOTUs;i++){
1160             
1161             if (m->control_pressed) { break; }
1162             
1163             for(int j=0;j<nSeqsPerOTU[i];j++){
1164                 int index = cumNumSeqs[i] + j;
1165                 double tauValue = singleTau[seqNumber[index]];
1166                 int sIndex = seqIndex[index];
1167                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
1168             }
1169         }
1170         
1171         for(int i=0;i<numSeqs;i++){
1172             double maxTau = -1.0000;
1173             int maxOTU = -1;
1174             for(int j=0;j<numOTUs;j++){
1175                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1176                     maxTau = bigTauMatrix[i * numOTUs + j];
1177                     maxOTU = j;
1178                 }
1179             }
1180             
1181             otuData[i] = maxOTU;
1182         }
1183         
1184         nSeqsPerOTU.assign(numOTUs, 0);         
1185         
1186         for(int i=0;i<numSeqs;i++){
1187             int index = otuData[i];
1188             
1189             singleTau[i] = 1.0000;
1190             dist[i] = 0.0000;
1191             
1192             aaP[index][nSeqsPerOTU[index]] = i;
1193             aaI[index][nSeqsPerOTU[index]] = i;
1194             
1195             nSeqsPerOTU[index]++;
1196         }
1197         fill(); 
1198     }
1199     catch(exception& e) {
1200         m->errorOut(e, "ShhherCommand", "setOTUs");
1201         exit(1);        
1202     }           
1203 }
1204
1205 /**************************************************************************************************/
1206  
1207 void ShhherCommand::getUniques(){
1208     try{
1209         
1210         
1211         numUniques = 0;
1212         uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
1213         uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
1214         uniqueLengths.assign(numSeqs, 0);
1215         mapSeqToUnique.assign(numSeqs, -1);
1216         mapUniqueToSeq.assign(numSeqs, -1);
1217         
1218         vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
1219         
1220         for(int i=0;i<numSeqs;i++){
1221             
1222             if (m->control_pressed) { break; }
1223             
1224             int index = 0;
1225             
1226             vector<short> current(numFlowCells);
1227             for(int j=0;j<numFlowCells;j++){
1228                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
1229             }
1230             
1231             for(int j=0;j<numUniques;j++){
1232                 int offset = j * numFlowCells;
1233                 bool toEnd = 1;
1234                 
1235                 int shorterLength;
1236                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
1237                 else                                                            {       shorterLength = uniqueLengths[j];       }
1238                 
1239                 for(int k=0;k<shorterLength;k++){
1240                     if(current[k] != uniqueFlowgrams[offset + k]){
1241                         toEnd = 0;
1242                         break;
1243                     }
1244                 }
1245                 
1246                 if(toEnd){
1247                     mapSeqToUnique[i] = j;
1248                     uniqueCount[j]++;
1249                     index = j;
1250                     if(lengths[i] > uniqueLengths[j])   {       uniqueLengths[j] = lengths[i];  }
1251                     break;
1252                 }
1253                 index++;
1254             }
1255             
1256             if(index == numUniques){
1257                 uniqueLengths[numUniques] = lengths[i];
1258                 uniqueCount[numUniques] = 1;
1259                 mapSeqToUnique[i] = numUniques;//anMap
1260                 mapUniqueToSeq[numUniques] = i;//anF
1261                 
1262                 for(int k=0;k<numFlowCells;k++){
1263                     uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
1264                     uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
1265                 }
1266                 
1267                 numUniques++;
1268             }
1269         }
1270         uniqueFlowDataIntI.resize(numFlowCells * numUniques);
1271         uniqueLengths.resize(numUniques);       
1272         
1273         flowDataPrI.resize(numSeqs * numFlowCells, 0);
1274         for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
1275     }
1276     catch(exception& e) {
1277         m->errorOut(e, "ShhherCommand", "getUniques");
1278         exit(1);
1279     }
1280 }
1281
1282 /**************************************************************************************************/
1283
1284 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
1285     try{
1286         int minLength = lengths[mapSeqToUnique[seqA]];
1287         if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
1288         
1289         int ANumFlowCells = seqA * numFlowCells;
1290         int BNumFlowCells = seqB * numFlowCells;
1291         
1292         float dist = 0;
1293         
1294         for(int i=0;i<minLength;i++){
1295             
1296             if (m->control_pressed) { break; }
1297             
1298             int flowAIntI = flowDataIntI[ANumFlowCells + i];
1299             float flowAPrI = flowDataPrI[ANumFlowCells + i];
1300             
1301             int flowBIntI = flowDataIntI[BNumFlowCells + i];
1302             float flowBPrI = flowDataPrI[BNumFlowCells + i];
1303             dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
1304         }
1305         
1306         dist /= (float) minLength;
1307         return dist;
1308     }
1309     catch(exception& e) {
1310         m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
1311         exit(1);
1312     }
1313 }
1314
1315 //**********************************************************************************************************************/
1316
1317 string ShhherCommand::cluster(string distFileName, string namesFileName){
1318     try {
1319         
1320         ReadMatrix* read = new ReadColumnMatrix(distFileName);  
1321         read->setCutoff(cutoff);
1322         
1323         NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1324         clusterNameMap->readMap();
1325         read->read(clusterNameMap);
1326         
1327         ListVector* list = read->getListVector();
1328         SparseMatrix* matrix = read->getMatrix();
1329         
1330         delete read; 
1331         delete clusterNameMap; 
1332         
1333         RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
1334         
1335         Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
1336         string tag = cluster->getTag();
1337         
1338         double clusterCutoff = cutoff;
1339         while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1340             
1341             if (m->control_pressed) { break; }
1342             
1343             cluster->update(clusterCutoff);
1344         }
1345         
1346         list->setLabel(toString(cutoff));
1347         
1348         string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
1349         ofstream listFile;
1350         m->openOutputFile(listFileName, listFile);
1351         list->print(listFile);
1352         listFile.close();
1353         
1354         delete matrix;  delete cluster; delete rabund; delete list;
1355         
1356         return listFileName;
1357     }
1358     catch(exception& e) {
1359         m->errorOut(e, "ShhherCommand", "cluster");
1360         exit(1);        
1361     }           
1362 }
1363
1364 /**************************************************************************************************/
1365
1366 void ShhherCommand::calcCentroidsDriver(int start, int finish){                          
1367     
1368     //this function gets the most likely homopolymer length at a flow position for a group of sequences
1369     //within an otu
1370     
1371     try{
1372         
1373         for(int i=start;i<finish;i++){
1374             
1375             if (m->control_pressed) { break; }
1376             
1377             double count = 0;
1378             int position = 0;
1379             int minFlowGram = 100000000;
1380             double minFlowValue = 1e8;
1381             change[i] = 0; //FALSE
1382             
1383             for(int j=0;j<nSeqsPerOTU[i];j++){
1384                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1385             }
1386             
1387             if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1388                 vector<double> adF(nSeqsPerOTU[i]);
1389                 vector<int> anL(nSeqsPerOTU[i]);
1390                 
1391                 for(int j=0;j<nSeqsPerOTU[i];j++){
1392                     int index = cumNumSeqs[i] + j;
1393                     int nI = seqIndex[index];
1394                     int nIU = mapSeqToUnique[nI];
1395                     
1396                     int k;
1397                     for(k=0;k<position;k++){
1398                         if(nIU == anL[k]){
1399                             break;
1400                         }
1401                     }
1402                     if(k == position){
1403                         anL[position] = nIU;
1404                         adF[position] = 0.0000;
1405                         position++;
1406                     }                                           
1407                 }
1408                 
1409                 for(int j=0;j<nSeqsPerOTU[i];j++){
1410                     int index = cumNumSeqs[i] + j;
1411                     int nI = seqIndex[index];
1412                     
1413                     double tauValue = singleTau[seqNumber[index]];
1414                     
1415                     for(int k=0;k<position;k++){
1416                         double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1417                         adF[k] += dist * tauValue;
1418                     }
1419                 }
1420                 
1421                 for(int j=0;j<position;j++){
1422                     if(adF[j] < minFlowValue){
1423                         minFlowGram = j;
1424                         minFlowValue = adF[j];
1425                     }
1426                 }
1427                 
1428                 if(centroids[i] != anL[minFlowGram]){
1429                     change[i] = 1;
1430                     centroids[i] = anL[minFlowGram];
1431                 }
1432             }
1433             else if(centroids[i] != -1){
1434                 change[i] = 1;
1435                 centroids[i] = -1;                      
1436             }
1437         }
1438     }
1439     catch(exception& e) {
1440         m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1441         exit(1);        
1442     }           
1443 }
1444
1445 /**************************************************************************************************/
1446
1447 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1448     try{
1449         
1450         int flowAValue = cent * numFlowCells;
1451         int flowBValue = flow * numFlowCells;
1452         
1453         double dist = 0;
1454         
1455         for(int i=0;i<length;i++){
1456             dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1457             flowAValue++;
1458             flowBValue++;
1459         }
1460         
1461         return dist / (double)length;
1462     }
1463     catch(exception& e) {
1464         m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1465         exit(1);        
1466     }           
1467 }
1468
1469 /**************************************************************************************************/
1470
1471 double ShhherCommand::getNewWeights(){
1472     try{
1473         
1474         double maxChange = 0;
1475         
1476         for(int i=0;i<numOTUs;i++){
1477             
1478             if (m->control_pressed) { break; }
1479             
1480             double difference = weight[i];
1481             weight[i] = 0;
1482             
1483             for(int j=0;j<nSeqsPerOTU[i];j++){
1484                 int index = cumNumSeqs[i] + j;
1485                 double tauValue = singleTau[seqNumber[index]];
1486                 weight[i] += tauValue;
1487             }
1488             
1489             difference = fabs(weight[i] - difference);
1490             if(difference > maxChange){ maxChange = difference; }
1491         }
1492         return maxChange;
1493     }
1494     catch(exception& e) {
1495         m->errorOut(e, "ShhherCommand", "getNewWeights");
1496         exit(1);        
1497     }           
1498 }
1499  
1500  /**************************************************************************************************/
1501  
1502 double ShhherCommand::getLikelihood(){
1503     
1504     try{
1505         
1506         vector<long double> P(numSeqs, 0);
1507         int effNumOTUs = 0;
1508         
1509         for(int i=0;i<numOTUs;i++){
1510             if(weight[i] > MIN_WEIGHT){
1511                 effNumOTUs++;
1512             }
1513         }
1514         
1515         string hold;
1516         for(int i=0;i<numOTUs;i++){
1517             
1518             if (m->control_pressed) { break; }
1519             
1520             for(int j=0;j<nSeqsPerOTU[i];j++){
1521                 int index = cumNumSeqs[i] + j;
1522                 int nI = seqIndex[index];
1523                 double singleDist = dist[seqNumber[index]];
1524                 
1525                 P[nI] += weight[i] * exp(-singleDist * sigma);
1526             }
1527         }
1528         double nLL = 0.00;
1529         for(int i=0;i<numSeqs;i++){
1530             if(P[i] == 0){      P[i] = DBL_EPSILON;     }
1531             
1532             nLL += -log(P[i]);
1533         }
1534         
1535         nLL = nLL -(double)numSeqs * log(sigma);
1536         
1537         return nLL; 
1538     }
1539     catch(exception& e) {
1540         m->errorOut(e, "ShhherCommand", "getNewWeights");
1541         exit(1);        
1542     }           
1543 }
1544
1545 /**************************************************************************************************/
1546
1547 void ShhherCommand::checkCentroids(){
1548     try{
1549         vector<int> unique(numOTUs, 1);
1550         
1551         for(int i=0;i<numOTUs;i++){
1552             if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1553                 unique[i] = -1;
1554             }
1555         }
1556         
1557         for(int i=0;i<numOTUs;i++){
1558             
1559             if (m->control_pressed) { break; }
1560             
1561             if(unique[i] == 1){
1562                 for(int j=i+1;j<numOTUs;j++){
1563                     if(unique[j] == 1){
1564                         
1565                         if(centroids[j] == centroids[i]){
1566                             unique[j] = 0;
1567                             centroids[j] = -1;
1568                             
1569                             weight[i] += weight[j];
1570                             weight[j] = 0.0;
1571                         }
1572                     }
1573                 }
1574             }
1575         }
1576     }
1577     catch(exception& e) {
1578         m->errorOut(e, "ShhherCommand", "checkCentroids");
1579         exit(1);        
1580     }           
1581 }
1582  /**************************************************************************************************/
1583
1584
1585  
1586 void ShhherCommand::writeQualities(vector<int> otuCounts){
1587     
1588     try {
1589         string thisOutputDir = outputDir;
1590         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1591         string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.qual";
1592         
1593         ofstream qualityFile;
1594         m->openOutputFile(qualityFileName, qualityFile);
1595         
1596         qualityFile.setf(ios::fixed, ios::floatfield);
1597         qualityFile.setf(ios::showpoint);
1598         qualityFile << setprecision(6);
1599         
1600         vector<vector<int> > qualities(numOTUs);
1601         vector<double> pr(HOMOPS, 0);
1602         
1603         
1604         for(int i=0;i<numOTUs;i++){
1605             
1606             if (m->control_pressed) { break; }
1607             
1608             int index = 0;
1609             int base = 0;
1610             
1611             if(nSeqsPerOTU[i] > 0){
1612                 qualities[i].assign(1024, -1);
1613                 
1614                 while(index < numFlowCells){
1615                     double maxPrValue = 1e8;
1616                     short maxPrIndex = -1;
1617                     double count = 0.0000;
1618                     
1619                     pr.assign(HOMOPS, 0);
1620                     
1621                     for(int j=0;j<nSeqsPerOTU[i];j++){
1622                         int lIndex = cumNumSeqs[i] + j;
1623                         double tauValue = singleTau[seqNumber[lIndex]];
1624                         int sequenceIndex = aaI[i][j];
1625                         short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1626                         
1627                         count += tauValue;
1628                         
1629                         for(int s=0;s<HOMOPS;s++){
1630                             pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1631                         }
1632                     }
1633                     
1634                     maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1635                     maxPrValue = pr[maxPrIndex];
1636                     
1637                     if(count > MIN_COUNT){
1638                         double U = 0.0000;
1639                         double norm = 0.0000;
1640                         
1641                         for(int s=0;s<HOMOPS;s++){
1642                             norm += exp(-(pr[s] - maxPrValue));
1643                         }
1644                         
1645                         for(int s=1;s<=maxPrIndex;s++){
1646                             int value = 0;
1647                             double temp = 0.0000;
1648                             
1649                             U += exp(-(pr[s-1]-maxPrValue))/norm;
1650                             
1651                             if(U>0.00){
1652                                 temp = log10(U);
1653                             }
1654                             else{
1655                                 temp = -10.1;
1656                             }
1657                             temp = floor(-10 * temp);
1658                             value = (int)floor(temp);
1659                             if(value > 100){    value = 100;    }
1660                             
1661                             qualities[i][base] = (int)value;
1662                             base++;
1663                         }
1664                     }
1665                     
1666                     index++;
1667                 }
1668             }
1669             
1670             
1671             if(otuCounts[i] > 0){
1672                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1673                 
1674                 int j=4;        //need to get past the first four bases
1675                 while(qualities[i][j] != -1){
1676                     qualityFile << qualities[i][j] << ' ';
1677                     j++;
1678                 }
1679                 qualityFile << endl;
1680             }
1681         }
1682         qualityFile.close();
1683         outputNames.push_back(qualityFileName);
1684         
1685     }
1686     catch(exception& e) {
1687         m->errorOut(e, "ShhherCommand", "writeQualities");
1688         exit(1);        
1689     }           
1690 }
1691
1692 /**************************************************************************************************/
1693
1694 void ShhherCommand::writeSequences(vector<int> otuCounts){
1695     try {
1696         string thisOutputDir = outputDir;
1697         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1698         string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.fasta";
1699         ofstream fastaFile;
1700         m->openOutputFile(fastaFileName, fastaFile);
1701         
1702         vector<string> names(numOTUs, "");
1703         
1704         for(int i=0;i<numOTUs;i++){
1705             
1706             if (m->control_pressed) { break; }
1707             
1708             int index = centroids[i];
1709             
1710             if(otuCounts[i] > 0){
1711                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1712                 
1713                 string newSeq = "";
1714                 
1715                 for(int j=0;j<numFlowCells;j++){
1716                     
1717                     char base = flowOrder[j % 4];
1718                     for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
1719                         newSeq += base;
1720                     }
1721                 }
1722                 
1723                 fastaFile << newSeq.substr(4) << endl;
1724             }
1725         }
1726         fastaFile.close();
1727         
1728         outputNames.push_back(fastaFileName);
1729         
1730         if(compositeFASTAFileName != ""){
1731             m->appendFiles(fastaFileName, compositeFASTAFileName);
1732         }
1733     }
1734     catch(exception& e) {
1735         m->errorOut(e, "ShhherCommand", "writeSequences");
1736         exit(1);        
1737     }           
1738 }
1739
1740 /**************************************************************************************************/
1741
1742 void ShhherCommand::writeNames(vector<int> otuCounts){
1743     try {
1744         string thisOutputDir = outputDir;
1745         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1746         string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.names";
1747         ofstream nameFile;
1748         m->openOutputFile(nameFileName, nameFile);
1749         
1750         for(int i=0;i<numOTUs;i++){
1751             
1752             if (m->control_pressed) { break; }
1753             
1754             if(otuCounts[i] > 0){
1755                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
1756                 
1757                 for(int j=1;j<nSeqsPerOTU[i];j++){
1758                     nameFile << ',' << seqNameVector[aaI[i][j]];
1759                 }
1760                 
1761                 nameFile << endl;
1762             }
1763         }
1764         nameFile.close();
1765         outputNames.push_back(nameFileName);
1766         
1767         
1768         if(compositeNamesFileName != ""){
1769             m->appendFiles(nameFileName, compositeNamesFileName);
1770         }               
1771     }
1772     catch(exception& e) {
1773         m->errorOut(e, "ShhherCommand", "writeNames");
1774         exit(1);        
1775     }           
1776 }
1777
1778 /**************************************************************************************************/
1779
1780 void ShhherCommand::writeGroups(){
1781     try {
1782         string thisOutputDir = outputDir;
1783         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1784         string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1785         string groupFileName = fileRoot + "shhh.groups";
1786         ofstream groupFile;
1787         m->openOutputFile(groupFileName, groupFile);
1788         
1789         for(int i=0;i<numSeqs;i++){
1790             if (m->control_pressed) { break; }
1791             groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
1792         }
1793         groupFile.close();
1794         outputNames.push_back(groupFileName);
1795         
1796     }
1797     catch(exception& e) {
1798         m->errorOut(e, "ShhherCommand", "writeGroups");
1799         exit(1);        
1800     }           
1801 }
1802
1803 /**************************************************************************************************/
1804
1805 void ShhherCommand::writeClusters(vector<int> otuCounts){
1806     try {
1807         string thisOutputDir = outputDir;
1808         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1809         string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName)) + "shhh.counts";
1810         ofstream otuCountsFile;
1811         m->openOutputFile(otuCountsFileName, otuCountsFile);
1812         
1813         string bases = flowOrder;
1814         
1815         for(int i=0;i<numOTUs;i++){
1816             
1817             if (m->control_pressed) {
1818                 break;
1819             }
1820             //output the translated version of the centroid sequence for the otu
1821             if(otuCounts[i] > 0){
1822                 int index = centroids[i];
1823                 
1824                 otuCountsFile << "ideal\t";
1825                 for(int j=8;j<numFlowCells;j++){
1826                     char base = bases[j % 4];
1827                     for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
1828                         otuCountsFile << base;
1829                     }
1830                 }
1831                 otuCountsFile << endl;
1832                 
1833                 for(int j=0;j<nSeqsPerOTU[i];j++){
1834                     int sequence = aaI[i][j];
1835                     otuCountsFile << seqNameVector[sequence] << '\t';
1836                     
1837                     string newSeq = "";
1838                     
1839                     for(int k=0;k<lengths[sequence];k++){
1840                         char base = bases[k % 4];
1841                         int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
1842                         
1843                         for(int s=0;s<freq;s++){
1844                             newSeq += base;
1845                             //otuCountsFile << base;
1846                         }
1847                     }
1848                     otuCountsFile << newSeq.substr(4) << endl;
1849                 }
1850                 otuCountsFile << endl;
1851             }
1852         }
1853         otuCountsFile.close();
1854         outputNames.push_back(otuCountsFileName);
1855         
1856     }
1857     catch(exception& e) {
1858         m->errorOut(e, "ShhherCommand", "writeClusters");
1859         exit(1);        
1860     }           
1861 }
1862
1863 #else
1864 //**********************************************************************************************************************
1865
1866 int ShhherCommand::execute(){
1867         try {
1868                 if (abort == true) { return 0; }
1869                 
1870                 getSingleLookUp();      if (m->control_pressed) { return 0; }
1871                 getJointLookUp();       if (m->control_pressed) { return 0; }
1872                 
1873         int numFiles = flowFileVector.size();
1874                 
1875         if (numFiles < processors) { processors = numFiles; }
1876         
1877 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1878         if (processors == 1) { driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size()); }
1879         else { createProcesses(flowFileVector); } //each processor processes one file
1880 #else
1881         driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName, 0, flowFileVector.size());
1882 #endif
1883         
1884                 if(compositeFASTAFileName != ""){
1885                         outputNames.push_back(compositeFASTAFileName);
1886                         outputNames.push_back(compositeNamesFileName);
1887                 }
1888
1889                 m->mothurOutEndLine();
1890                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
1891                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
1892                 m->mothurOutEndLine();
1893                 
1894                 return 0;
1895         }
1896         catch(exception& e) {
1897                 m->errorOut(e, "ShhherCommand", "execute");
1898                 exit(1);
1899         }
1900 }
1901 #endif
1902 /**************************************************************************************************/
1903
1904 int ShhherCommand::createProcesses(vector<string> filenames){
1905     try {
1906         vector<int> processIDS;
1907                 int process = 1;
1908                 int num = 0;
1909                 
1910                 //sanity check
1911                 if (filenames.size() < processors) { processors = filenames.size(); }
1912                 
1913                 //divide the groups between the processors
1914                 vector<linePair> lines;
1915                 int numFilesPerProcessor = filenames.size() / processors;
1916                 for (int i = 0; i < processors; i++) {
1917                         int startIndex =  i * numFilesPerProcessor;
1918                         int endIndex = (i+1) * numFilesPerProcessor;
1919                         if(i == (processors - 1)){      endIndex = filenames.size();    }
1920                         lines.push_back(linePair(startIndex, endIndex));
1921                 }
1922                 
1923         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)          
1924                 
1925                 //loop through and create all the processes you want
1926                 while (process != processors) {
1927                         int pid = fork();
1928                         
1929                         if (pid > 0) {
1930                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1931                                 process++;
1932                         }else if (pid == 0){
1933                                 num = driver(filenames, compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName  + toString(getpid()) + ".temp", lines[process].start, lines[process].end);
1934                                 exit(0);
1935                         }else { 
1936                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
1937                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
1938                                 exit(0);
1939                         }
1940                 }
1941                 
1942                 //do my part
1943                 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[0].start, lines[0].end);
1944                 
1945                 //force parent to wait until all the processes are done
1946                 for (int i=0;i<processIDS.size();i++) { 
1947                         int temp = processIDS[i];
1948                         wait(&temp);
1949                 }
1950         
1951         #else
1952         
1953         //////////////////////////////////////////////////////////////////////////////////////////////////////
1954         
1955         /////////////////////// NOT WORKING, ACCESS VIOLATION ON READ OF FLOWGRAMS IN THREAD /////////////////
1956         
1957         //////////////////////////////////////////////////////////////////////////////////////////////////////
1958                 //Windows version shared memory, so be careful when passing variables through the shhhFlowsData struct. 
1959                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
1960                 //////////////////////////////////////////////////////////////////////////////////////////////////////
1961                 
1962                 vector<shhhFlowsData*> pDataArray; 
1963                 DWORD   dwThreadIdArray[processors-1];
1964                 HANDLE  hThreadArray[processors-1]; 
1965                 
1966                 //Create processor worker threads.
1967                 for( int i=0; i<processors-1; i++ ){
1968                         // Allocate memory for thread data.
1969                         string extension = "";
1970                         if (i != 0) { extension = toString(i) + ".temp"; }
1971                         
1972             shhhFlowsData* tempFlow = new shhhFlowsData(filenames, (compositeFASTAFileName + extension), (compositeNamesFileName + extension), outputDir, flowOrder, jointLookUp, singleLookUp, m, lines[i].start, lines[i].end, cutoff, sigma, minDelta, maxIters, i);
1973                         pDataArray.push_back(tempFlow);
1974                         processIDS.push_back(i);
1975             
1976                         hThreadArray[i] = CreateThread(NULL, 0, ShhhFlowsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
1977                 }
1978                 
1979                 //using the main process as a worker saves time and memory
1980                 //do my part
1981                 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[processors-1].start, lines[processors-1].end);
1982                 
1983                 //Wait until all threads have terminated.
1984                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
1985                 
1986                 //Close all thread handles and free memory allocations.
1987                 for(int i=0; i < pDataArray.size(); i++){
1988                         for(int j=0; j < pDataArray[i]->outputNames.size(); j++){ outputNames.push_back(pDataArray[i]->outputNames[j]); }
1989                         CloseHandle(hThreadArray[i]);
1990                         delete pDataArray[i];
1991                 }
1992                 
1993         #endif
1994         
1995         for (int i=0;i<processIDS.size();i++) { 
1996             if (compositeFASTAFileName != "") {
1997                 m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
1998                 m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
1999                 m->mothurRemove((compositeFASTAFileName + toString(processIDS[i]) + ".temp"));
2000                 m->mothurRemove((compositeNamesFileName + toString(processIDS[i]) + ".temp"));
2001             }
2002         }
2003         
2004         return 0;
2005         
2006     }
2007         catch(exception& e) {
2008                 m->errorOut(e, "ShhherCommand", "createProcesses");
2009                 exit(1);
2010         }
2011 }
2012 /**************************************************************************************************/
2013
2014 int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName, int start, int end){
2015     try {
2016         
2017         for(int i=start;i<end;i++){
2018                         
2019                         if (m->control_pressed) { break; }
2020                         
2021                         string flowFileName = filenames[i];
2022             
2023                         m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
2024                         m->mothurOut("Reading flowgrams...\n");
2025                         
2026             vector<string> seqNameVector;
2027             vector<int> lengths;
2028             vector<short> flowDataIntI;
2029             vector<double> flowDataPrI;
2030             map<string, int> nameMap;
2031             vector<short> uniqueFlowgrams;
2032             vector<int> uniqueCount;
2033             vector<int> mapSeqToUnique;
2034             vector<int> mapUniqueToSeq;
2035             vector<int> uniqueLengths;
2036             int numFlowCells;
2037             
2038             int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
2039                         
2040                         if (m->control_pressed) { break; }
2041                         
2042                         m->mothurOut("Identifying unique flowgrams...\n");
2043                         int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
2044                         
2045                         if (m->control_pressed) { break; }
2046                         
2047                         m->mothurOut("Calculating distances between flowgrams...\n");
2048             string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
2049             unsigned long long begTime = time(NULL);
2050             double begClock = clock();
2051         
2052             flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);     
2053             
2054             m->mothurOutEndLine();
2055             m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
2056
2057             
2058                         string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
2059                         createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
2060                         
2061                         if (m->control_pressed) { break; }
2062                         
2063                         m->mothurOut("\nClustering flowgrams...\n");
2064             string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
2065                         cluster(listFileName, distFileName, namesFileName);
2066                         
2067                         if (m->control_pressed) { break; }
2068             
2069             vector<int> otuData;
2070             vector<int> cumNumSeqs;
2071             vector<int> nSeqsPerOTU;
2072             vector<vector<int> > aaP;   //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2073             vector<vector<int> > aaI;   //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2074             vector<int> seqNumber;              //tMaster->anP:         the sequence id number sorted by OTU
2075             vector<int> seqIndex;               //tMaster->anI;         the index that corresponds to seqNumber
2076
2077                         
2078                         int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
2079                         
2080                         if (m->control_pressed) { break; }
2081                         
2082                         m->mothurRemove(distFileName);
2083                         m->mothurRemove(namesFileName);
2084                         m->mothurRemove(listFileName);
2085                         
2086             vector<double> dist;                //adDist - distance of sequences to centroids
2087             vector<short> change;               //did the centroid sequence change? 0 = no; 1 = yes
2088             vector<int> centroids;              //the representative flowgram for each cluster m
2089             vector<double> weight;
2090             vector<double> singleTau;   //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2091             vector<int> nSeqsBreaks;
2092             vector<int> nOTUsBreaks;
2093             
2094                         dist.assign(numSeqs * numOTUs, 0);
2095             change.assign(numOTUs, 1);
2096             centroids.assign(numOTUs, -1);
2097             weight.assign(numOTUs, 0);
2098             singleTau.assign(numSeqs, 1.0);
2099             
2100             nSeqsBreaks.assign(2, 0);
2101             nOTUsBreaks.assign(2, 0);
2102             
2103             nSeqsBreaks[0] = 0;
2104             nSeqsBreaks[1] = numSeqs;
2105             nOTUsBreaks[1] = numOTUs;
2106                         
2107                         if (m->control_pressed) { break; }
2108                         
2109                         double maxDelta = 0;
2110                         int iter = 0;
2111                         
2112                         begClock = clock();
2113                         begTime = time(NULL);
2114             
2115                         m->mothurOut("\nDenoising flowgrams...\n");
2116                         m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
2117                         
2118                         while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
2119                                 
2120                                 if (m->control_pressed) { break; }
2121                                 
2122                                 double cycClock = clock();
2123                                 unsigned long long cycTime = time(NULL);
2124                                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2125                                 
2126                                 if (m->control_pressed) { break; }
2127                 
2128                                 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2129                                 
2130                                 if (m->control_pressed) { break; }
2131                 
2132                                 maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);  
2133                 
2134                 if (m->control_pressed) { break; }
2135                 
2136                                 double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight); 
2137                 
2138                 if (m->control_pressed) { break; }
2139                 
2140                                 checkCentroids(numOTUs, centroids, weight);
2141                                 
2142                                 if (m->control_pressed) { break; }
2143                                 
2144                                 calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU,  dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
2145                                 
2146                                 if (m->control_pressed) { break; }
2147                                 
2148                                 iter++;
2149                                 
2150                                 m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
2151                 
2152                         }       
2153                         
2154                         if (m->control_pressed) { break; }
2155                         
2156                         m->mothurOut("\nFinalizing...\n");
2157                         fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2158                         
2159                         if (m->control_pressed) { break; }
2160                         
2161                         setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
2162                         
2163                         if (m->control_pressed) { break; }
2164                         
2165                         vector<int> otuCounts(numOTUs, 0);
2166                         for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
2167                         
2168                         calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber); 
2169             
2170             if (m->control_pressed) { break; }
2171             
2172                         writeQualities(numOTUs, numFlowCells, flowFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
2173                         writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, flowFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
2174                         writeNames(thisCompositeNamesFileName, numOTUs, flowFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU);                              if (m->control_pressed) { break; }
2175                         writeClusters(flowFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI);                       if (m->control_pressed) { break; }
2176                         writeGroups(flowFileName, numSeqs, seqNameVector);                                              if (m->control_pressed) { break; }
2177                         
2178                         m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
2179                 }
2180                 
2181         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
2182         
2183         return 0;
2184         
2185     }catch(exception& e) {
2186             m->errorOut(e, "ShhherCommand", "driver");
2187             exit(1);
2188     }
2189 }
2190
2191 /**************************************************************************************************/
2192 int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVector, vector<int>& thisLengths, vector<short>& thisFlowDataIntI, map<string, int>& thisNameMap, int& numFlowCells){
2193         try{
2194        
2195                 ifstream flowFile;
2196        
2197                 m->openInputFile(filename, flowFile);
2198                 
2199                 string seqName;
2200                 int currentNumFlowCells;
2201                 float intensity;
2202         thisSeqNameVector.clear();
2203                 thisLengths.clear();
2204                 thisFlowDataIntI.clear();
2205                 thisNameMap.clear();
2206                 
2207                 flowFile >> numFlowCells;
2208                 int index = 0;//pcluster
2209                 while(!flowFile.eof()){
2210                         
2211                         if (m->control_pressed) { break; }
2212                         
2213                         flowFile >> seqName >> currentNumFlowCells;
2214                         thisLengths.push_back(currentNumFlowCells);
2215            
2216                         thisSeqNameVector.push_back(seqName);
2217                         thisNameMap[seqName] = index++;//pcluster
2218
2219                         for(int i=0;i<numFlowCells;i++){
2220                                 flowFile >> intensity;
2221                                 if(intensity > 9.99)    {       intensity = 9.99;       }
2222                                 int intI = int(100 * intensity + 0.0001);
2223                                 thisFlowDataIntI.push_back(intI);
2224                         }
2225                         m->gobble(flowFile);
2226                 }
2227                 flowFile.close();
2228                 
2229                 int numSeqs = thisSeqNameVector.size();         
2230                 
2231                 for(int i=0;i<numSeqs;i++){
2232                         
2233                         if (m->control_pressed) { break; }
2234                         
2235                         int iNumFlowCells = i * numFlowCells;
2236                         for(int j=thisLengths[i];j<numFlowCells;j++){
2237                                 thisFlowDataIntI[iNumFlowCells + j] = 0;
2238                         }
2239                 }
2240         
2241         return numSeqs;
2242                 
2243         }
2244         catch(exception& e) {
2245                 m->errorOut(e, "ShhherCommand", "getFlowData");
2246                 exit(1);
2247         }
2248 }
2249 /**************************************************************************************************/
2250
2251 int ShhherCommand::flowDistParentFork(int numFlowCells, string distFileName, int stopSeq, vector<int>& mapUniqueToSeq, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2252         try{            
2253         
2254                 ostringstream outStream;
2255                 outStream.setf(ios::fixed, ios::floatfield);
2256                 outStream.setf(ios::dec, ios::basefield);
2257                 outStream.setf(ios::showpoint);
2258                 outStream.precision(6);
2259                 
2260                 int begTime = time(NULL);
2261                 double begClock = clock();
2262         
2263                 for(int i=0;i<stopSeq;i++){
2264                         
2265                         if (m->control_pressed) { break; }
2266                         
2267                         for(int j=0;j<i;j++){
2268                                 float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2269                 
2270                                 if(flowDistance < 1e-6){
2271                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
2272                                 }
2273                                 else if(flowDistance <= cutoff){
2274                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
2275                                 }
2276                         }
2277                         if(i % 100 == 0){
2278                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
2279                                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2280                                 m->mothurOutEndLine();
2281                         }
2282                 }
2283                 
2284                 ofstream distFile(distFileName.c_str());
2285                 distFile << outStream.str();            
2286                 distFile.close();
2287                 
2288                 if (m->control_pressed) {}
2289                 else {
2290                         m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
2291                         m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
2292                         m->mothurOutEndLine();
2293                 }
2294         
2295         return 0;
2296         }
2297         catch(exception& e) {
2298                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
2299                 exit(1);
2300         }
2301 }
2302 /**************************************************************************************************/
2303
2304 float ShhherCommand::calcPairwiseDist(int numFlowCells, int seqA, int seqB, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2305         try{
2306                 int minLength = lengths[mapSeqToUnique[seqA]];
2307                 if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
2308                 
2309                 int ANumFlowCells = seqA * numFlowCells;
2310                 int BNumFlowCells = seqB * numFlowCells;
2311                 
2312                 float dist = 0;
2313                 
2314                 for(int i=0;i<minLength;i++){
2315                         
2316                         if (m->control_pressed) { break; }
2317                         
2318                         int flowAIntI = flowDataIntI[ANumFlowCells + i];
2319                         float flowAPrI = flowDataPrI[ANumFlowCells + i];
2320                         
2321                         int flowBIntI = flowDataIntI[BNumFlowCells + i];
2322                         float flowBPrI = flowDataPrI[BNumFlowCells + i];
2323                         dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
2324                 }
2325                 
2326                 dist /= (float) minLength;
2327                 return dist;
2328         }
2329         catch(exception& e) {
2330                 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
2331                 exit(1);
2332         }
2333 }
2334
2335 /**************************************************************************************************/
2336
2337 int ShhherCommand::getUniques(int numSeqs, int numFlowCells, vector<short>& uniqueFlowgrams, vector<int>& uniqueCount, vector<int>& uniqueLengths, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2338         try{
2339                 int numUniques = 0;
2340                 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
2341                 uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
2342                 uniqueLengths.assign(numSeqs, 0);
2343                 mapSeqToUnique.assign(numSeqs, -1);
2344                 mapUniqueToSeq.assign(numSeqs, -1);
2345                 
2346                 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
2347                 
2348                 for(int i=0;i<numSeqs;i++){
2349                         
2350                         if (m->control_pressed) { break; }
2351                         
2352                         int index = 0;
2353                         
2354                         vector<short> current(numFlowCells);
2355                         for(int j=0;j<numFlowCells;j++){
2356                                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
2357                         }
2358             
2359                         for(int j=0;j<numUniques;j++){
2360                                 int offset = j * numFlowCells;
2361                                 bool toEnd = 1;
2362                                 
2363                                 int shorterLength;
2364                                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
2365                                 else                                                            {       shorterLength = uniqueLengths[j];       }
2366                 
2367                                 for(int k=0;k<shorterLength;k++){
2368                                         if(current[k] != uniqueFlowgrams[offset + k]){
2369                                                 toEnd = 0;
2370                                                 break;
2371                                         }
2372                                 }
2373                                 
2374                                 if(toEnd){
2375                                         mapSeqToUnique[i] = j;
2376                                         uniqueCount[j]++;
2377                                         index = j;
2378                                         if(lengths[i] > uniqueLengths[j])       {       uniqueLengths[j] = lengths[i];  }
2379                                         break;
2380                                 }
2381                                 index++;
2382                         }
2383                         
2384                         if(index == numUniques){
2385                                 uniqueLengths[numUniques] = lengths[i];
2386                                 uniqueCount[numUniques] = 1;
2387                                 mapSeqToUnique[i] = numUniques;//anMap
2388                                 mapUniqueToSeq[numUniques] = i;//anF
2389                                 
2390                                 for(int k=0;k<numFlowCells;k++){
2391                                         uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
2392                                         uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
2393                                 }
2394                                 
2395                                 numUniques++;
2396                         }
2397                 }
2398                 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
2399                 uniqueLengths.resize(numUniques);       
2400                 
2401                 flowDataPrI.resize(numSeqs * numFlowCells, 0);
2402                 for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
2403         
2404         return numUniques;
2405         }
2406         catch(exception& e) {
2407                 m->errorOut(e, "ShhherCommand", "getUniques");
2408                 exit(1);
2409         }
2410 }
2411 /**************************************************************************************************/
2412 int ShhherCommand::createNamesFile(int numSeqs, int numUniques, string filename, vector<string>& seqNameVector, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq){
2413         try{
2414                 
2415                 vector<string> duplicateNames(numUniques, "");
2416                 for(int i=0;i<numSeqs;i++){
2417                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
2418                 }
2419                 
2420                 ofstream nameFile;
2421                 m->openOutputFile(filename, nameFile);
2422                 
2423                 for(int i=0;i<numUniques;i++){
2424                         
2425                         if (m->control_pressed) { break; }
2426                         
2427             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2428                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2429                 }
2430                 
2431                 nameFile.close();
2432         
2433                 return 0;
2434         }
2435         catch(exception& e) {
2436                 m->errorOut(e, "ShhherCommand", "createNamesFile");
2437                 exit(1);
2438         }
2439 }
2440 //**********************************************************************************************************************
2441
2442 int ShhherCommand::cluster(string filename, string distFileName, string namesFileName){
2443         try {
2444                 
2445                 ReadMatrix* read = new ReadColumnMatrix(distFileName);  
2446                 read->setCutoff(cutoff);
2447                 
2448                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
2449                 clusterNameMap->readMap();
2450                 read->read(clusterNameMap);
2451                 
2452                 ListVector* list = read->getListVector();
2453                 SparseMatrix* matrix = read->getMatrix();
2454                 
2455                 delete read; 
2456                 delete clusterNameMap; 
2457         
2458                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
2459                 
2460                 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
2461                 string tag = cluster->getTag();
2462                 
2463                 double clusterCutoff = cutoff;
2464                 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
2465                         
2466                         if (m->control_pressed) { break; }
2467                         
2468                         cluster->update(clusterCutoff);
2469                 }
2470                 
2471                 list->setLabel(toString(cutoff));
2472                 
2473                 ofstream listFile;
2474                 m->openOutputFile(filename, listFile);
2475                 list->print(listFile);
2476                 listFile.close();
2477                 
2478                 delete matrix;  delete cluster; delete rabund; delete list;
2479         
2480                 return 0;
2481         }
2482         catch(exception& e) {
2483                 m->errorOut(e, "ShhherCommand", "cluster");
2484                 exit(1);        
2485         }               
2486 }
2487 /**************************************************************************************************/
2488
2489 int ShhherCommand::getOTUData(int numSeqs, string fileName,  vector<int>& otuData,
2490                                vector<int>& cumNumSeqs,
2491                                vector<int>& nSeqsPerOTU,
2492                                vector<vector<int> >& aaP,       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2493                                vector<vector<int> >& aaI,       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2494                                vector<int>& seqNumber,          //tMaster->anP:         the sequence id number sorted by OTU
2495                                vector<int>& seqIndex,
2496                                map<string, int>& nameMap){
2497         try {
2498         
2499                 ifstream listFile;
2500                 m->openInputFile(fileName, listFile);
2501                 string label;
2502         int numOTUs;
2503                 
2504                 listFile >> label >> numOTUs;
2505         
2506                 otuData.assign(numSeqs, 0);
2507                 cumNumSeqs.assign(numOTUs, 0);
2508                 nSeqsPerOTU.assign(numOTUs, 0);
2509                 aaP.clear();aaP.resize(numOTUs);
2510                 
2511                 seqNumber.clear();
2512                 aaI.clear();
2513                 seqIndex.clear();
2514                 
2515                 string singleOTU = "";
2516                 
2517                 for(int i=0;i<numOTUs;i++){
2518                         
2519                         if (m->control_pressed) { break; }
2520             
2521                         listFile >> singleOTU;
2522                         
2523                         istringstream otuString(singleOTU);
2524             
2525                         while(otuString){
2526                                 
2527                                 string seqName = "";
2528                                 
2529                                 for(int j=0;j<singleOTU.length();j++){
2530                                         char letter = otuString.get();
2531                                         
2532                                         if(letter != ','){
2533                                                 seqName += letter;
2534                                         }
2535                                         else{
2536                                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2537                                                 int index = nmIt->second;
2538                                                 
2539                                                 nameMap.erase(nmIt);
2540                                                 
2541                                                 otuData[index] = i;
2542                                                 nSeqsPerOTU[i]++;
2543                                                 aaP[i].push_back(index);
2544                                                 seqName = "";
2545                                         }
2546                                 }
2547                                 
2548                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2549                 
2550                                 int index = nmIt->second;
2551                                 nameMap.erase(nmIt);
2552                 
2553                                 otuData[index] = i;
2554                                 nSeqsPerOTU[i]++;
2555                                 aaP[i].push_back(index);        
2556                                 
2557                                 otuString.get();
2558                         }
2559                         
2560                         sort(aaP[i].begin(), aaP[i].end());
2561                         for(int j=0;j<nSeqsPerOTU[i];j++){
2562                                 seqNumber.push_back(aaP[i][j]);
2563                         }
2564                         for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
2565                                 aaP[i].push_back(0);
2566                         }
2567                         
2568                         
2569                 }
2570                 
2571                 for(int i=1;i<numOTUs;i++){
2572                         cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
2573                 }
2574                 aaI = aaP;
2575                 seqIndex = seqNumber;
2576                 
2577                 listFile.close();       
2578         
2579         return numOTUs;
2580                 
2581         }
2582         catch(exception& e) {
2583                 m->errorOut(e, "ShhherCommand", "getOTUData");
2584                 exit(1);        
2585         }               
2586 }
2587 /**************************************************************************************************/
2588
2589 int ShhherCommand::calcCentroidsDriver(int numOTUs, 
2590                                           vector<int>& cumNumSeqs,
2591                                           vector<int>& nSeqsPerOTU,
2592                                           vector<int>& seqIndex,
2593                                           vector<short>& change,                //did the centroid sequence change? 0 = no; 1 = yes
2594                                           vector<int>& centroids,               //the representative flowgram for each cluster m
2595                                           vector<double>& singleTau,    //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2596                                           vector<int>& mapSeqToUnique,
2597                                           vector<short>& uniqueFlowgrams,
2598                                           vector<short>& flowDataIntI,
2599                                           vector<int>& lengths,
2600                                           int numFlowCells,
2601                                           vector<int>& seqNumber){                          
2602         
2603         //this function gets the most likely homopolymer length at a flow position for a group of sequences
2604         //within an otu
2605         
2606         try{
2607                 
2608                 for(int i=0;i<numOTUs;i++){
2609                         
2610                         if (m->control_pressed) { break; }
2611                         
2612                         double count = 0;
2613                         int position = 0;
2614                         int minFlowGram = 100000000;
2615                         double minFlowValue = 1e8;
2616                         change[i] = 0; //FALSE
2617                         
2618                         for(int j=0;j<nSeqsPerOTU[i];j++){
2619                                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
2620                         }
2621             
2622                         if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
2623                                 vector<double> adF(nSeqsPerOTU[i]);
2624                                 vector<int> anL(nSeqsPerOTU[i]);
2625                                 
2626                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2627                                         int index = cumNumSeqs[i] + j;
2628                                         int nI = seqIndex[index];
2629                                         int nIU = mapSeqToUnique[nI];
2630                                         
2631                                         int k;
2632                                         for(k=0;k<position;k++){
2633                                                 if(nIU == anL[k]){
2634                                                         break;
2635                                                 }
2636                                         }
2637                                         if(k == position){
2638                                                 anL[position] = nIU;
2639                                                 adF[position] = 0.0000;
2640                                                 position++;
2641                                         }                                               
2642                                 }
2643                                 
2644                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2645                                         int index = cumNumSeqs[i] + j;
2646                                         int nI = seqIndex[index];
2647                                         
2648                                         double tauValue = singleTau[seqNumber[index]];
2649                                         
2650                                         for(int k=0;k<position;k++){
2651                                                 double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
2652                                                 adF[k] += dist * tauValue;
2653                                         }
2654                                 }
2655                                 
2656                                 for(int j=0;j<position;j++){
2657                                         if(adF[j] < minFlowValue){
2658                                                 minFlowGram = j;
2659                                                 minFlowValue = adF[j];
2660                                         }
2661                                 }
2662                                 
2663                                 if(centroids[i] != anL[minFlowGram]){
2664                                         change[i] = 1;
2665                                         centroids[i] = anL[minFlowGram];
2666                                 }
2667                         }
2668                         else if(centroids[i] != -1){
2669                                 change[i] = 1;
2670                                 centroids[i] = -1;                      
2671                         }
2672                 }
2673         
2674         return 0;
2675         }
2676         catch(exception& e) {
2677                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
2678                 exit(1);        
2679         }               
2680 }
2681 /**************************************************************************************************/
2682
2683 double ShhherCommand::getDistToCentroid(int cent, int flow, int length, vector<short>& uniqueFlowgrams,
2684                                         vector<short>& flowDataIntI, int numFlowCells){
2685         try{
2686                 
2687                 int flowAValue = cent * numFlowCells;
2688                 int flowBValue = flow * numFlowCells;
2689                 
2690                 double dist = 0;
2691         
2692                 for(int i=0;i<length;i++){
2693                         dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
2694                         flowAValue++;
2695                         flowBValue++;
2696                 }
2697                 
2698                 return dist / (double)length;
2699         }
2700         catch(exception& e) {
2701                 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
2702                 exit(1);        
2703         }               
2704 }
2705 /**************************************************************************************************/
2706
2707 double ShhherCommand::getNewWeights(int numOTUs, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<double>& singleTau, vector<int>& seqNumber, vector<double>& weight){
2708         try{
2709                 
2710                 double maxChange = 0;
2711                 
2712                 for(int i=0;i<numOTUs;i++){
2713                         
2714                         if (m->control_pressed) { break; }
2715                         
2716                         double difference = weight[i];
2717                         weight[i] = 0;
2718                         
2719                         for(int j=0;j<nSeqsPerOTU[i];j++){
2720                                 int index = cumNumSeqs[i] + j;
2721                                 double tauValue = singleTau[seqNumber[index]];
2722                                 weight[i] += tauValue;
2723                         }
2724                         
2725                         difference = fabs(weight[i] - difference);
2726                         if(difference > maxChange){     maxChange = difference; }
2727                 }
2728                 return maxChange;
2729         }
2730         catch(exception& e) {
2731                 m->errorOut(e, "ShhherCommand", "getNewWeights");
2732                 exit(1);        
2733         }               
2734 }
2735
2736 /**************************************************************************************************/
2737
2738 double ShhherCommand::getLikelihood(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<int>& seqNumber, vector<int>& cumNumSeqs, vector<int>& seqIndex, vector<double>& dist, vector<double>& weight){
2739         
2740         try{
2741                 
2742                 vector<long double> P(numSeqs, 0);
2743                 int effNumOTUs = 0;
2744                 
2745                 for(int i=0;i<numOTUs;i++){
2746                         if(weight[i] > MIN_WEIGHT){
2747                                 effNumOTUs++;
2748                         }
2749                 }
2750                 
2751                 string hold;
2752                 for(int i=0;i<numOTUs;i++){
2753                         
2754                         if (m->control_pressed) { break; }
2755                         
2756                         for(int j=0;j<nSeqsPerOTU[i];j++){
2757                                 int index = cumNumSeqs[i] + j;
2758                                 int nI = seqIndex[index];
2759                                 double singleDist = dist[seqNumber[index]];
2760                                 
2761                                 P[nI] += weight[i] * exp(-singleDist * sigma);
2762                         }
2763                 }
2764                 double nLL = 0.00;
2765                 for(int i=0;i<numSeqs;i++){
2766                         if(P[i] == 0){  P[i] = DBL_EPSILON;     }
2767             
2768                         nLL += -log(P[i]);
2769                 }
2770                 
2771                 nLL = nLL -(double)numSeqs * log(sigma);
2772         
2773                 return nLL; 
2774         }
2775         catch(exception& e) {
2776                 m->errorOut(e, "ShhherCommand", "getNewWeights");
2777                 exit(1);        
2778         }               
2779 }
2780
2781 /**************************************************************************************************/
2782
2783 int ShhherCommand::checkCentroids(int numOTUs, vector<int>& centroids, vector<double>& weight){
2784         try{
2785                 vector<int> unique(numOTUs, 1);
2786                 
2787                 for(int i=0;i<numOTUs;i++){
2788                         if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
2789                                 unique[i] = -1;
2790                         }
2791                 }
2792                 
2793                 for(int i=0;i<numOTUs;i++){
2794                         
2795                         if (m->control_pressed) { break; }
2796                         
2797                         if(unique[i] == 1){
2798                                 for(int j=i+1;j<numOTUs;j++){
2799                                         if(unique[j] == 1){
2800                                                 
2801                                                 if(centroids[j] == centroids[i]){
2802                                                         unique[j] = 0;
2803                                                         centroids[j] = -1;
2804                                                         
2805                                                         weight[i] += weight[j];
2806                                                         weight[j] = 0.0;
2807                                                 }
2808                                         }
2809                                 }
2810                         }
2811                 }
2812         
2813         return 0;
2814         }
2815         catch(exception& e) {
2816                 m->errorOut(e, "ShhherCommand", "checkCentroids");
2817                 exit(1);        
2818         }               
2819 }
2820 /**************************************************************************************************/
2821
2822 void ShhherCommand::calcNewDistances(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<double>& dist, 
2823                                      vector<double>& weight, vector<short>& change, vector<int>& centroids,
2824                                      vector<vector<int> >& aaP, vector<double>& singleTau, vector<vector<int> >& aaI,   
2825                                      vector<int>& seqNumber, vector<int>& seqIndex,
2826                                      vector<short>& uniqueFlowgrams,
2827                                      vector<short>& flowDataIntI, int numFlowCells, vector<int>& lengths){
2828         
2829         try{
2830                 
2831                 int total = 0;
2832                 vector<double> newTau(numOTUs,0);
2833                 vector<double> norms(numSeqs, 0);
2834                 nSeqsPerOTU.assign(numOTUs, 0);
2835         
2836                 for(int i=0;i<numSeqs;i++){
2837                         
2838                         if (m->control_pressed) { break; }
2839                         
2840                         int indexOffset = i * numOTUs;
2841             
2842                         double offset = 1e8;
2843                         
2844                         for(int j=0;j<numOTUs;j++){
2845                 
2846                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
2847                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
2848                                 }
2849                 
2850                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
2851                                         offset = dist[indexOffset + j];
2852                                 }
2853                         }
2854             
2855                         for(int j=0;j<numOTUs;j++){
2856                                 if(weight[j] > MIN_WEIGHT){
2857                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
2858                                         norms[i] += newTau[j];
2859                                 }
2860                                 else{
2861                                         newTau[j] = 0.0;
2862                                 }
2863                         }
2864             
2865                         for(int j=0;j<numOTUs;j++){
2866                                 newTau[j] /= norms[i];
2867                         }
2868             
2869                         for(int j=0;j<numOTUs;j++){
2870                                 if(newTau[j] > MIN_TAU){
2871                                         
2872                                         int oldTotal = total;
2873                                         
2874                                         total++;
2875                                         
2876                                         singleTau.resize(total, 0);
2877                                         seqNumber.resize(total, 0);
2878                                         seqIndex.resize(total, 0);
2879                                         
2880                                         singleTau[oldTotal] = newTau[j];
2881                                         
2882                                         aaP[j][nSeqsPerOTU[j]] = oldTotal;
2883                                         aaI[j][nSeqsPerOTU[j]] = i;
2884                                         nSeqsPerOTU[j]++;
2885                                 }
2886                         }
2887             
2888                 }
2889         
2890         }
2891         catch(exception& e) {
2892                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
2893                 exit(1);        
2894         }               
2895 }
2896 /**************************************************************************************************/
2897
2898 int ShhherCommand::fill(int numOTUs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
2899         try {
2900                 int index = 0;
2901                 for(int i=0;i<numOTUs;i++){
2902                         
2903                         if (m->control_pressed) { return 0; }
2904                         
2905                         cumNumSeqs[i] = index;
2906                         for(int j=0;j<nSeqsPerOTU[i];j++){
2907                                 seqNumber[index] = aaP[i][j];
2908                                 seqIndex[index] = aaI[i][j];
2909                                 
2910                                 index++;
2911                         }
2912                 }
2913         
2914         return 0; 
2915         }
2916         catch(exception& e) {
2917                 m->errorOut(e, "ShhherCommand", "fill");
2918                 exit(1);        
2919         }               
2920 }
2921 /**************************************************************************************************/
2922
2923 void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU,
2924                             vector<int>& otuData, vector<double>& singleTau, vector<double>& dist, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
2925         
2926         try {
2927                 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
2928                 
2929                 for(int i=0;i<numOTUs;i++){
2930                         
2931                         if (m->control_pressed) { break; }
2932                         
2933                         for(int j=0;j<nSeqsPerOTU[i];j++){
2934                                 int index = cumNumSeqs[i] + j;
2935                                 double tauValue = singleTau[seqNumber[index]];
2936                                 int sIndex = seqIndex[index];
2937                                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
2938                         }
2939                 }
2940                 
2941                 for(int i=0;i<numSeqs;i++){
2942                         double maxTau = -1.0000;
2943                         int maxOTU = -1;
2944                         for(int j=0;j<numOTUs;j++){
2945                                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
2946                                         maxTau = bigTauMatrix[i * numOTUs + j];
2947                                         maxOTU = j;
2948                                 }
2949                         }
2950                         
2951                         otuData[i] = maxOTU;
2952                 }
2953                 
2954                 nSeqsPerOTU.assign(numOTUs, 0);         
2955                 
2956                 for(int i=0;i<numSeqs;i++){
2957                         int index = otuData[i];
2958                         
2959                         singleTau[i] = 1.0000;
2960                         dist[i] = 0.0000;
2961                         
2962                         aaP[index][nSeqsPerOTU[index]] = i;
2963                         aaI[index][nSeqsPerOTU[index]] = i;
2964                         
2965                         nSeqsPerOTU[index]++;
2966                 }
2967         
2968                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);  
2969         }
2970         catch(exception& e) {
2971                 m->errorOut(e, "ShhherCommand", "setOTUs");
2972                 exit(1);        
2973         }               
2974 }
2975 /**************************************************************************************************/
2976
2977 void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string filename, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
2978                                    vector<double>& singleTau, vector<short>& flowDataIntI, vector<short>& uniqueFlowgrams, vector<int>& cumNumSeqs,
2979                                    vector<int>& mapUniqueToSeq, vector<string>& seqNameVector, vector<int>& centroids, vector<vector<int> >& aaI){
2980         
2981         try {
2982                 string thisOutputDir = outputDir;
2983                 if (outputDir == "") {  thisOutputDir += m->hasPath(filename);  }
2984                 string qualityFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.qual";
2985         
2986                 ofstream qualityFile;
2987                 m->openOutputFile(qualityFileName, qualityFile);
2988         
2989                 qualityFile.setf(ios::fixed, ios::floatfield);
2990                 qualityFile.setf(ios::showpoint);
2991                 qualityFile << setprecision(6);
2992                 
2993                 vector<vector<int> > qualities(numOTUs);
2994                 vector<double> pr(HOMOPS, 0);
2995                 
2996                 
2997                 for(int i=0;i<numOTUs;i++){
2998                         
2999                         if (m->control_pressed) { break; }
3000                         
3001                         int index = 0;
3002                         int base = 0;
3003                         
3004                         if(nSeqsPerOTU[i] > 0){
3005                                 qualities[i].assign(1024, -1);
3006                                 
3007                                 while(index < numFlowCells){
3008                                         double maxPrValue = 1e8;
3009                                         short maxPrIndex = -1;
3010                                         double count = 0.0000;
3011                                         
3012                                         pr.assign(HOMOPS, 0);
3013                                         
3014                                         for(int j=0;j<nSeqsPerOTU[i];j++){
3015                                                 int lIndex = cumNumSeqs[i] + j;
3016                                                 double tauValue = singleTau[seqNumber[lIndex]];
3017                                                 int sequenceIndex = aaI[i][j];
3018                                                 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
3019                                                 
3020                                                 count += tauValue;
3021                                                 
3022                                                 for(int s=0;s<HOMOPS;s++){
3023                                                         pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
3024                                                 }
3025                                         }
3026                                         
3027                                         maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
3028                                         maxPrValue = pr[maxPrIndex];
3029                                         
3030                                         if(count > MIN_COUNT){
3031                                                 double U = 0.0000;
3032                                                 double norm = 0.0000;
3033                                                 
3034                                                 for(int s=0;s<HOMOPS;s++){
3035                                                         norm += exp(-(pr[s] - maxPrValue));
3036                                                 }
3037                                                 
3038                                                 for(int s=1;s<=maxPrIndex;s++){
3039                                                         int value = 0;
3040                                                         double temp = 0.0000;
3041                                                         
3042                                                         U += exp(-(pr[s-1]-maxPrValue))/norm;
3043                                                         
3044                                                         if(U>0.00){
3045                                                                 temp = log10(U);
3046                                                         }
3047                                                         else{
3048                                                                 temp = -10.1;
3049                                                         }
3050                                                         temp = floor(-10 * temp);
3051                                                         value = (int)floor(temp);
3052                                                         if(value > 100){        value = 100;    }
3053                                                         
3054                                                         qualities[i][base] = (int)value;
3055                                                         base++;
3056                                                 }
3057                                         }
3058                                         
3059                                         index++;
3060                                 }
3061                         }
3062                         
3063                         
3064                         if(otuCounts[i] > 0){
3065                                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
3066                         
3067                                 int j=4;        //need to get past the first four bases
3068                                 while(qualities[i][j] != -1){
3069                     qualityFile << qualities[i][j] << ' ';
3070                     if (j > qualities[i].size()) { break; }
3071                     j++;
3072                                 }
3073                                 qualityFile << endl;
3074                         }
3075                 }
3076                 qualityFile.close();
3077                 outputNames.push_back(qualityFileName);
3078         
3079         }
3080         catch(exception& e) {
3081                 m->errorOut(e, "ShhherCommand", "writeQualities");
3082                 exit(1);        
3083         }               
3084 }
3085
3086 /**************************************************************************************************/
3087
3088 void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string filename, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
3089         try {
3090                 string thisOutputDir = outputDir;
3091                 if (outputDir == "") {  thisOutputDir += m->hasPath(filename);  }
3092                 string fastaFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.fasta";
3093                 ofstream fastaFile;
3094                 m->openOutputFile(fastaFileName, fastaFile);
3095                 
3096                 vector<string> names(numOTUs, "");
3097                 
3098                 for(int i=0;i<numOTUs;i++){
3099                         
3100                         if (m->control_pressed) { break; }
3101                         
3102                         int index = centroids[i];
3103                         
3104                         if(otuCounts[i] > 0){
3105                                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
3106                                 
3107                                 string newSeq = "";
3108                                 
3109                                 for(int j=0;j<numFlowCells;j++){
3110                                         
3111                                         char base = flowOrder[j % 4];
3112                                         for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
3113                                                 newSeq += base;
3114                                         }
3115                                 }
3116                                 
3117                                 fastaFile << newSeq.substr(4) << endl;
3118                         }
3119                 }
3120                 fastaFile.close();
3121         
3122                 outputNames.push_back(fastaFileName);
3123         
3124                 if(thisCompositeFASTAFileName != ""){
3125                         m->appendFiles(fastaFileName, thisCompositeFASTAFileName);
3126                 }
3127         }
3128         catch(exception& e) {
3129                 m->errorOut(e, "ShhherCommand", "writeSequences");
3130                 exit(1);        
3131         }               
3132 }
3133
3134 /**************************************************************************************************/
3135
3136 void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string filename, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
3137         try {
3138                 string thisOutputDir = outputDir;
3139                 if (outputDir == "") {  thisOutputDir += m->hasPath(filename);  }
3140                 string nameFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.names";
3141                 ofstream nameFile;
3142                 m->openOutputFile(nameFileName, nameFile);
3143                 
3144                 for(int i=0;i<numOTUs;i++){
3145                         
3146                         if (m->control_pressed) { break; }
3147                         
3148                         if(otuCounts[i] > 0){
3149                                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
3150                                 
3151                                 for(int j=1;j<nSeqsPerOTU[i];j++){
3152                                         nameFile << ',' << seqNameVector[aaI[i][j]];
3153                                 }
3154                                 
3155                                 nameFile << endl;
3156                         }
3157                 }
3158                 nameFile.close();
3159                 outputNames.push_back(nameFileName);
3160                 
3161                 
3162                 if(thisCompositeNamesFileName != ""){
3163                         m->appendFiles(nameFileName, thisCompositeNamesFileName);
3164                 }               
3165         }
3166         catch(exception& e) {
3167                 m->errorOut(e, "ShhherCommand", "writeNames");
3168                 exit(1);        
3169         }               
3170 }
3171
3172 /**************************************************************************************************/
3173
3174 void ShhherCommand::writeGroups(string filename, int numSeqs, vector<string>& seqNameVector){
3175         try {
3176                 string thisOutputDir = outputDir;
3177                 if (outputDir == "") {  thisOutputDir += m->hasPath(filename);  }
3178                 string fileRoot = thisOutputDir + m->getRootName(m->getSimpleName(filename));
3179                 string groupFileName = fileRoot + "shhh.groups";
3180                 ofstream groupFile;
3181                 m->openOutputFile(groupFileName, groupFile);
3182                 
3183                 for(int i=0;i<numSeqs;i++){
3184                         if (m->control_pressed) { break; }
3185                         groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
3186                 }
3187                 groupFile.close();
3188                 outputNames.push_back(groupFileName);
3189         
3190         }
3191         catch(exception& e) {
3192                 m->errorOut(e, "ShhherCommand", "writeGroups");
3193                 exit(1);        
3194         }               
3195 }
3196
3197 /**************************************************************************************************/
3198
3199 void ShhherCommand::writeClusters(string filename, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
3200         try {
3201                 string thisOutputDir = outputDir;
3202                 if (outputDir == "") {  thisOutputDir += m->hasPath(filename);  }
3203                 string otuCountsFileName = thisOutputDir + m->getRootName(m->getSimpleName(filename)) + "shhh.counts";
3204                 ofstream otuCountsFile;
3205                 m->openOutputFile(otuCountsFileName, otuCountsFile);
3206                 
3207                 string bases = flowOrder;
3208                 
3209                 for(int i=0;i<numOTUs;i++){
3210                         
3211                         if (m->control_pressed) {
3212                                 break;
3213                         }
3214                         //output the translated version of the centroid sequence for the otu
3215                         if(otuCounts[i] > 0){
3216                                 int index = centroids[i];
3217                                 
3218                                 otuCountsFile << "ideal\t";
3219                                 for(int j=8;j<numFlowCells;j++){
3220                                         char base = bases[j % 4];
3221                                         for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
3222                                                 otuCountsFile << base;
3223                                         }
3224                                 }
3225                                 otuCountsFile << endl;
3226                                 
3227                                 for(int j=0;j<nSeqsPerOTU[i];j++){
3228                                         int sequence = aaI[i][j];
3229                                         otuCountsFile << seqNameVector[sequence] << '\t';
3230                                         
3231                                         string newSeq = "";
3232                                         
3233                                         for(int k=0;k<lengths[sequence];k++){
3234                                                 char base = bases[k % 4];
3235                                                 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
3236                         
3237                                                 for(int s=0;s<freq;s++){
3238                                                         newSeq += base;
3239                                                         //otuCountsFile << base;
3240                                                 }
3241                                         }
3242                                         otuCountsFile << newSeq.substr(4) << endl;
3243                                 }
3244                                 otuCountsFile << endl;
3245                         }
3246                 }
3247                 otuCountsFile.close();
3248                 outputNames.push_back(otuCountsFileName);
3249         
3250         }
3251         catch(exception& e) {
3252                 m->errorOut(e, "ShhherCommand", "writeClusters");
3253                 exit(1);        
3254         }               
3255 }
3256
3257 /**************************************************************************************************/
3258
3259 void ShhherCommand::getSingleLookUp(){
3260         try{
3261                 //      these are the -log probabilities that a signal corresponds to a particular homopolymer length
3262                 singleLookUp.assign(HOMOPS * NUMBINS, 0);
3263                 
3264                 int index = 0;
3265                 ifstream lookUpFile;
3266                 m->openInputFile(lookupFileName, lookUpFile);
3267                 
3268                 for(int i=0;i<HOMOPS;i++){
3269                         
3270                         if (m->control_pressed) { break; }
3271                         
3272                         float logFracFreq;
3273                         lookUpFile >> logFracFreq;
3274                         
3275                         for(int j=0;j<NUMBINS;j++)      {
3276                                 lookUpFile >> singleLookUp[index];
3277                                 index++;
3278                         }
3279                 }       
3280                 lookUpFile.close();
3281         }
3282         catch(exception& e) {
3283                 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
3284                 exit(1);
3285         }
3286 }
3287
3288 /**************************************************************************************************/
3289
3290 void ShhherCommand::getJointLookUp(){
3291         try{
3292                 
3293                 //      the most likely joint probability (-log) that two intenities have the same polymer length
3294                 jointLookUp.resize(NUMBINS * NUMBINS, 0);
3295                 
3296                 for(int i=0;i<NUMBINS;i++){
3297                         
3298                         if (m->control_pressed) { break; }
3299                         
3300                         for(int j=0;j<NUMBINS;j++){             
3301                                 
3302                                 double minSum = 100000000;
3303                                 
3304                                 for(int k=0;k<HOMOPS;k++){
3305                                         double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
3306                                         
3307                                         if(sum < minSum)        {       minSum = sum;           }
3308                                 }       
3309                                 jointLookUp[i * NUMBINS + j] = minSum;
3310                         }
3311                 }
3312         }
3313         catch(exception& e) {
3314                 m->errorOut(e, "ShhherCommand", "getJointLookUp");
3315                 exit(1);
3316         }
3317 }
3318
3319 /**************************************************************************************************/
3320
3321 double ShhherCommand::getProbIntensity(int intIntensity){                          
3322         try{
3323
3324                 double minNegLogProb = 100000000; 
3325
3326                 
3327                 for(int i=0;i<HOMOPS;i++){//loop signal strength
3328                         
3329                         if (m->control_pressed) { break; }
3330                         
3331                         float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
3332                         if(negLogProb < minNegLogProb)  {       minNegLogProb = negLogProb; }
3333                 }
3334                 
3335                 return minNegLogProb;
3336         }
3337         catch(exception& e) {
3338                 m->errorOut(e, "ShhherCommand", "getProbIntensity");
3339                 exit(1);
3340         }
3341 }
3342
3343
3344
3345