]> git.donarmstrong.com Git - mothur.git/blob - shhhercommand.cpp
added citation function to commands
[mothur.git] / shhhercommand.cpp
1 /*
2  *  shhher.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/27/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "shhhercommand.h"
11
12 #include "readcolumn.h"
13 #include "readmatrix.hpp"
14 #include "rabundvector.hpp"
15 #include "sabundvector.hpp"
16 #include "listvector.hpp"
17 #include "cluster.hpp"
18 #include "sparsematrix.hpp"
19 #include <cfloat>
20
21 //**********************************************************************************************************************
22
23 #define NUMBINS 1000
24 #define HOMOPS 10
25 #define MIN_COUNT 0.1
26 #define MIN_WEIGHT 0.1
27 #define MIN_TAU 0.0001
28 #define MIN_ITER 10
29 //**********************************************************************************************************************
30 vector<string> ShhherCommand::setParameters(){  
31         try {
32                 CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pflow);
33                 CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none",false,false); parameters.push_back(pfile);
34                 CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none",false,false); parameters.push_back(plookup);
35                 CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "",false,false); parameters.push_back(pcutoff);
36                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "",false,false); parameters.push_back(pprocessors);
37                 CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "",false,false); parameters.push_back(pmaxiter);
38                 CommandParameter psigma("sigma", "Number", "", "60", "", "", "",false,false); parameters.push_back(psigma);
39                 CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "",false,false); parameters.push_back(pmindelta);
40                 CommandParameter porder("order", "String", "", "", "", "", "",false,false); parameters.push_back(porder);
41                 CommandParameter pinputdir("inputdir", "String", "", "", "", "", "",false,false); parameters.push_back(pinputdir);
42                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "",false,false); parameters.push_back(poutputdir);
43                 
44                 vector<string> myArray;
45                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
46                 return myArray;
47         }
48         catch(exception& e) {
49                 m->errorOut(e, "ShhherCommand", "setParameters");
50                 exit(1);
51         }
52 }
53 //**********************************************************************************************************************
54 string ShhherCommand::getHelpString(){  
55         try {
56                 string helpString = "";
57                 helpString += "The shhh.seqs command reads a file containing flowgrams and creates a file of corrected sequences.\n";
58                 return helpString;
59         }
60         catch(exception& e) {
61                 m->errorOut(e, "ShhherCommand", "getHelpString");
62                 exit(1);
63         }
64 }
65 //**********************************************************************************************************************
66
67 ShhherCommand::ShhherCommand(){ 
68         try {
69                 abort = true; calledHelp = true;
70                 setParameters();
71                 
72                 //initialize outputTypes
73                 vector<string> tempOutNames;
74                 outputTypes["pn.dist"] = tempOutNames;
75
76         }
77         catch(exception& e) {
78                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
79                 exit(1);
80         }
81 }
82
83 //**********************************************************************************************************************
84
85 ShhherCommand::ShhherCommand(string option) {
86         try {
87
88 #ifdef USE_MPI
89                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
90                 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
91
92                 if(pid == 0){
93 #endif
94                 
95                 
96                 abort = false; calledHelp = false;   
97                 
98                 
99                 //allow user to run help
100                 if(option == "help") { help(); abort = true; calledHelp = true; }
101                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
102                 
103                 else {
104                         vector<string> myArray = setParameters();
105                         
106                         OptionParser parser(option);
107                         map<string,string> parameters = parser.getParameters();
108                         
109                         ValidParameters validParameter;
110                         map<string,string>::iterator it;
111                         
112                         //check to make sure all parameters are valid for command
113                         for (it = parameters.begin(); it != parameters.end(); it++) { 
114                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
115                         }
116                         
117                         //initialize outputTypes
118                         vector<string> tempOutNames;
119                         outputTypes["pn.dist"] = tempOutNames;
120                         //                      outputTypes["fasta"] = tempOutNames;
121                         
122                         //if the user changes the input directory command factory will send this info to us in the output parameter 
123                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
124                         if (inputDir == "not found"){   inputDir = "";          }
125                         else {
126                                 string path;
127                                 it = parameters.find("flow");
128                                 //user has given a template file
129                                 if(it != parameters.end()){ 
130                                         path = m->hasPath(it->second);
131                                         //if the user has not given a path then, add inputdir. else leave path alone.
132                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
133                                 }
134                                 
135                                 it = parameters.find("lookup");
136                                 //user has given a template file
137                                 if(it != parameters.end()){ 
138                                         path = m->hasPath(it->second);
139                                         //if the user has not given a path then, add inputdir. else leave path alone.
140                                         if (path == "") {       parameters["lookup"] = inputDir + it->second;           }
141                                 }
142
143                                 it = parameters.find("file");
144                                 //user has given a template file
145                                 if(it != parameters.end()){ 
146                                         path = m->hasPath(it->second);
147                                         //if the user has not given a path then, add inputdir. else leave path alone.
148                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
149                                 }
150                         }
151                         
152                         
153                         //check for required parameters
154                         flowFileName = validParameter.validFile(parameters, "flow", true);
155                         flowFilesFileName = validParameter.validFile(parameters, "file", true);
156                         if (flowFileName == "not found" && flowFilesFileName == "not found") {
157                                 m->mothurOut("values for either flow or file must be provided for the shhh.seqs command.");
158                                 m->mothurOutEndLine();
159                                 abort = true; 
160                         }
161                         else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
162                         
163                         if(flowFileName != "not found"){        compositeFASTAFileName = "";    }
164                         else{
165                                 compositeFASTAFileName = flowFilesFileName.substr(0, flowFilesFileName.length()-10) + "pn.fasta";
166                                 ofstream temp;
167                                 m->openOutputFile(compositeFASTAFileName, temp);
168                                 temp.close();
169                         }
170                         
171                         //if the user changes the output directory command factory will send this info to us in the output parameter 
172                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  
173                                 outputDir = ""; 
174                                 outputDir += m->hasPath(flowFileName); //if user entered a file with a path then preserve it    
175                         }
176                         
177                         
178                         //check for optional parameter and set defaults
179                         // ...at some point should added some additional type checking...
180                         string temp;
181                         temp = validParameter.validFile(parameters, "lookup", true);
182                         if (temp == "not found")        {       lookupFileName = "LookUp_Titanium.pat"; }
183                         else if(temp == "not open")     {       abort = true;                   } 
184                         else                                            {       lookupFileName = temp;  }
185                         
186                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
187                         m->setProcessors(temp);
188                         convert(temp, processors);
189
190                         temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.01";          }
191                         convert(temp, cutoff); 
192                         
193                         temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){       temp = "0.000001";      }
194                         convert(temp, minDelta); 
195
196                         temp = validParameter.validFile(parameters, "maxiter", false);  if (temp == "not found"){       temp = "1000";          }
197                         convert(temp, maxIters); 
198
199                         temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
200                         convert(temp, sigma); 
201                         
202                         flowOrder = validParameter.validFile(parameters, "order", false);
203                         if (flowOrder == "not found"){ flowOrder = "TACG";              }
204                         else if(flowOrder.length() != 4){
205                                 m->mothurOut("The value of the order option must be four bases long\n");
206                         }
207                         
208                 }
209                         
210 #ifdef USE_MPI
211                 }                               
212 #endif
213                                 
214         }
215         catch(exception& e) {
216                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
217                 exit(1);
218         }
219 }
220 //**********************************************************************************************************************
221 #ifdef USE_MPI
222 int ShhherCommand::execute(){
223         try {
224                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
225                 
226                 int tag = 1976;
227                 MPI_Status status; 
228
229                 if(pid == 0){
230
231                         for(int i=1;i<ncpus;i++){
232                                 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
233                         }
234                         if(abort == 1){ return 0;       }
235
236                         processors = ncpus;
237                         
238                         m->mothurOut("\nGetting preliminary data...\n");
239                         getSingleLookUp();
240                         getJointLookUp();
241                         
242                         vector<string> flowFileVector;
243                         if(flowFilesFileName != "not found"){
244                                 string fName;
245
246                                 ifstream flowFilesFile;
247                                 m->openInputFile(flowFilesFileName, flowFilesFile);
248                                 while(flowFilesFile){
249                                         flowFilesFile >> fName;
250                                         flowFileVector.push_back(fName);
251                                         m->gobble(flowFilesFile);
252                                 }
253                         }
254                         else{
255                                 flowFileVector.push_back(flowFileName);
256                         }
257                         int numFiles = flowFileVector.size();
258
259                         for(int i=1;i<ncpus;i++){
260                                 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
261                         }
262                         
263                         for(int i=0;i<numFiles;i++){
264                                 double begClock = clock();
265                                 unsigned long int begTime = time(NULL);
266
267                                 flowFileName = flowFileVector[i];
268                                 
269                                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
270                                 m->mothurOut("Reading flowgrams...\n");
271                                 getFlowData();
272
273                                 m->mothurOut("Identifying unique flowgrams...\n");
274                                 getUniques();
275
276                                 m->mothurOut("Calculating distances between flowgrams...\n");
277                                 char fileName[1024];
278                                 strcpy(fileName, flowFileName.c_str());
279
280                                 for(int i=1;i<ncpus;i++){
281                                         MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
282
283                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
284                                         MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
285                                         MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
286                                         MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
287                                         MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
288                                         MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
289                                         MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
290                                         MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
291                                         MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
292                                         MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
293                                 }                               
294                                                         
295                                 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
296                                 
297                                 int done;
298                                 for(int i=1;i<ncpus;i++){
299                                         MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
300                                         
301                                         m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
302                                         remove((distFileName + ".temp." + toString(i)).c_str());
303                                 }
304
305                                 string namesFileName = createNamesFile();
306                                 
307                                 m->mothurOut("\nClustering flowgrams...\n");
308                                 string listFileName = cluster(distFileName, namesFileName);
309
310                                 getOTUData(listFileName);
311                                 initPyroCluster();
312
313                                 for(int i=1;i<ncpus;i++){
314                                         MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
315                                         MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
316                                         MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
317                                         MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
318                                 }
319                                 
320                                 
321                                 double maxDelta = 0;
322                                 int iter = 0;
323                                 
324                                 int numOTUsOnCPU = numOTUs / ncpus;
325                                 int numSeqsOnCPU = numSeqs / ncpus;
326                                 m->mothurOut("\nDenoising flowgrams...\n");
327                                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
328                                 
329                                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
330
331                                         double cycClock = clock();
332                                         unsigned long int cycTime = time(NULL);
333                                         fill();
334
335                                         int total = singleTau.size();
336                                         for(int i=1;i<ncpus;i++){
337                                                 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
338                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
339                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
340                                                 
341                                                 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
342                                                 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
343                                                 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
344                                                 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
345                                                 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
346                                         }
347                                         
348                                         calcCentroidsDriver(0, numOTUsOnCPU);
349                                         
350                                         for(int i=1;i<ncpus;i++){
351                                                 int otuStart = i * numOTUs / ncpus;
352                                                 int otuStop = (i + 1) * numOTUs / ncpus;
353                                                 
354                                                 vector<int> tempCentroids(numOTUs, 0);
355                                                 vector<short> tempChange(numOTUs, 0);
356                                                 
357                                                 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
358                                                 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
359                                                 
360                                                 for(int j=otuStart;j<otuStop;j++){
361                                                         centroids[j] = tempCentroids[j];
362                                                         change[j] = tempChange[j];
363                                                 }
364                                         }
365                                                                         
366                                         maxDelta = getNewWeights();
367                                         double nLL = getLikelihood();
368                                         checkCentroids();
369                                         
370                                         for(int i=1;i<ncpus;i++){
371                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
372                                                 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
373                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
374                                         }
375                                         
376                                         calcNewDistancesParent(0, numSeqsOnCPU);
377
378                                         total = singleTau.size();
379
380                                         for(int i=1;i<ncpus;i++){
381                                                 int childTotal;
382                                                 int seqStart = i * numSeqs / ncpus;
383                                                 int seqStop = (i + 1) * numSeqs / ncpus;
384                                                 
385                                                 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
386
387                                                 vector<int> childSeqIndex(childTotal, 0);
388                                                 vector<double> childSingleTau(childTotal, 0);
389                                                 vector<double> childDist(numSeqs * numOTUs, 0);
390                                                 vector<int> otuIndex(childTotal, 0);
391                                                 
392                                                 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
393                                                 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
394                                                 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
395                                                 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
396
397                                                 int oldTotal = total;
398                                                 total += childTotal;
399                                                 singleTau.resize(total, 0);
400                                                 seqIndex.resize(total, 0);
401                                                 seqNumber.resize(total, 0);
402                                                 
403                                                 int childIndex = 0;
404                                                 
405                                                 for(int j=oldTotal;j<total;j++){
406                                                         int otuI = otuIndex[childIndex];
407                                                         int seqI = childSeqIndex[childIndex];
408                                                         
409                                                         singleTau[j] = childSingleTau[childIndex];
410                                                         
411                                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
412                                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
413                                                         nSeqsPerOTU[otuI]++;
414                                                         childIndex++;
415                                                 }
416                                                 
417                                                 int index = seqStart * numOTUs;
418                                                 for(int j=seqStart;j<seqStop;j++){
419                                                         for(int k=0;k<numOTUs;k++){
420                                                                 dist[index] = childDist[index];
421                                                                 index++;
422                                                         }
423                                                 }                                       
424                                         }
425                                         
426                                         iter++;
427                                         
428                                         m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');                  
429
430                                         if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
431                                                 int live = 1;
432                                                 for(int i=1;i<ncpus;i++){
433                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
434                                                 }
435                                         }
436                                         else{
437                                                 int live = 0;
438                                                 for(int i=1;i<ncpus;i++){
439                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
440                                                 }
441                                         }
442                                         
443                                 }       
444                                 
445                                 m->mothurOut("\nFinalizing...\n");
446                                 fill();
447                                 setOTUs();
448                                 vector<int> otuCounts(numOTUs, 0);
449                                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
450                                 calcCentroidsDriver(0, numOTUs);
451                                 writeQualities(otuCounts);
452                                 writeSequences(otuCounts);
453                                 writeNames(otuCounts);
454                                 writeClusters(otuCounts);
455                                 writeGroups();
456                                 
457                                 remove(distFileName.c_str());
458                                 remove(namesFileName.c_str());
459                                 remove(listFileName.c_str());
460                                                                  
461                                 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');                 
462                         }
463                 }
464                 else{
465                         int abort = 1;
466
467                         MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
468                         if(abort){      return 0;       }
469
470                         int numFiles;
471                         MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
472
473                         for(int i=0;i<numFiles;i++){
474                                 //Now into the pyrodist part
475                                 bool live = 1;
476
477                                 char fileName[1024];
478                                 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
479                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
480                                 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
481                                 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
482                                 
483                                 flowDataIntI.resize(numSeqs * numFlowCells);
484                                 flowDataPrI.resize(numSeqs * numFlowCells);
485                                 mapUniqueToSeq.resize(numSeqs);
486                                 mapSeqToUnique.resize(numSeqs);
487                                 lengths.resize(numSeqs);
488                                 jointLookUp.resize(NUMBINS * NUMBINS);
489
490                                 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
491                                 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
492                                 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
493                                 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
494                                 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
495                                 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
496                                 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
497
498                                 flowFileName = string(fileName);
499                                 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
500                                 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
501                                 
502                                 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
503
504                                 int done = 1;
505                                 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
506
507                                 //Now into the pyronoise part
508                                 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
509                                 
510                                 singleLookUp.resize(HOMOPS * NUMBINS);
511                                 uniqueFlowgrams.resize(numUniques * numFlowCells);
512                                 weight.resize(numOTUs);
513                                 centroids.resize(numOTUs);
514                                 change.resize(numOTUs);
515                                 dist.assign(numOTUs * numSeqs, 0);
516                                 nSeqsPerOTU.resize(numOTUs);
517                                 cumNumSeqs.resize(numOTUs);
518
519                                 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
520                                 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
521                                 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
522                                 
523                                 int startOTU = pid * numOTUs / ncpus;
524                                 int endOTU = (pid + 1) * numOTUs / ncpus;
525
526                                 int startSeq = pid * numSeqs / ncpus;
527                                 int endSeq = (pid + 1) * numSeqs /ncpus;
528                                 
529                                 int total;
530
531                                 while(live){
532                                         
533                                         MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
534                                         singleTau.assign(total, 0.0000);
535                                         seqNumber.assign(total, 0);
536                                         seqIndex.assign(total, 0);
537
538                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
539                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
540                                         MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
541                                         MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
542                                         MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
543                                         MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
544                                         MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
545
546                                         calcCentroidsDriver(startOTU, endOTU);
547
548                                         MPI_Send(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
549                                         MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
550
551                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
552                                         MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
553                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
554
555                                         vector<int> otuIndex(total, 0);
556                                         calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
557                                         total = otuIndex.size();
558                                         
559                                         MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
560                                         MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
561                                         MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
562                                         MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
563                                         MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
564                                 
565                                         MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
566                                 }
567                         }
568                 }               
569                 MPI_Barrier(MPI_COMM_WORLD);
570
571                 return 0;
572
573         }
574         catch(exception& e) {
575                 m->errorOut(e, "ShhherCommand", "execute");
576                 exit(1);
577         }
578 }
579
580 /**************************************************************************************************/
581
582 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
583         try{            
584                 ostringstream outStream;
585                 outStream.setf(ios::fixed, ios::floatfield);
586                 outStream.setf(ios::dec, ios::basefield);
587                 outStream.setf(ios::showpoint);
588                 outStream.precision(6);
589                 
590                 int begTime = time(NULL);
591                 double begClock = clock();
592                 
593                 for(int i=startSeq;i<stopSeq;i++){
594                         for(int j=0;j<i;j++){
595                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
596                                 
597                                 if(flowDistance < 1e-6){
598                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
599                                 }
600                                 else if(flowDistance <= cutoff){
601                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
602                                 }
603                         }
604                         if(i % 100 == 0){
605                                 m->mothurOut(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
606                         }
607                 }
608                 
609                 m->mothurOut(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
610                 
611                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.dist";
612                 if(pid != 0){   fDistFileName += ".temp." + toString(pid);      }
613
614                 ofstream distFile(fDistFileName.c_str());
615                 distFile << outStream.str();            
616                 distFile.close();
617                 
618                 return fDistFileName;
619         }
620         catch(exception& e) {
621                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
622                 exit(1);
623         }
624 }
625
626 #else
627 //**********************************************************************************************************************
628
629 int ShhherCommand::execute(){
630         try {
631                 if (abort == true) { return 0; }
632                 
633                 getSingleLookUp();
634                 getJointLookUp();
635                                 
636                 vector<string> flowFileVector;
637                 if(flowFilesFileName != "not found"){
638                         string fName;
639                         
640                         ifstream flowFilesFile;
641                         m->openInputFile(flowFilesFileName, flowFilesFile);
642                         while(flowFilesFile){
643                                 flowFilesFile >> fName;
644                                 flowFileVector.push_back(fName);
645                                 m->gobble(flowFilesFile);
646                         }
647                 }
648                 else{
649                         flowFileVector.push_back(flowFileName);
650                 }
651                 int numFiles = flowFileVector.size();
652                 
653                 
654                 for(int i=0;i<numFiles;i++){
655                         flowFileName = flowFileVector[i];
656
657                         m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
658                         m->mothurOut("Reading flowgrams...\n");
659                         getFlowData();
660                         
661                         m->mothurOut("Identifying unique flowgrams...\n");
662                         getUniques();
663                         
664                         
665                         m->mothurOut("Calculating distances between flowgrams...\n");
666                         string distFileName = createDistFile(processors);
667                         string namesFileName = createNamesFile();
668                                 
669                         m->mothurOut("\nClustering flowgrams...\n");
670                         string listFileName = cluster(distFileName, namesFileName);
671                         getOTUData(listFileName);
672                         
673                         initPyroCluster();
674                         
675                         double maxDelta = 0;
676                         int iter = 0;
677                         
678                         double begClock = clock();
679                         unsigned long int begTime = time(NULL);
680
681                         m->mothurOut("\nDenoising flowgrams...\n");
682                         m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
683                         
684                         while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
685                                 
686                                 double cycClock = clock();
687                                 unsigned long int cycTime = time(NULL);
688                                 fill();
689                                 
690                                 calcCentroids();
691                                 
692                                 maxDelta = getNewWeights();
693                                 double nLL = getLikelihood();
694                                 checkCentroids();
695                                 
696                                 calcNewDistances();
697
698                                 iter++;
699                                 
700                                 m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
701
702                         }       
703                         
704                         m->mothurOut("\nFinalizing...\n");
705                         fill();
706                         setOTUs();
707                         
708                         vector<int> otuCounts(numOTUs, 0);
709                         for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
710                         
711                         calcCentroidsDriver(0, numOTUs);
712                         writeQualities(otuCounts);
713                         writeSequences(otuCounts);
714                         writeNames(otuCounts);
715                         writeClusters(otuCounts);
716                         writeGroups();
717                         
718                         remove(distFileName.c_str());
719                         remove(namesFileName.c_str());
720                         remove(listFileName.c_str());
721                         
722                         m->mothurOut("Total time to process " + flowFileName + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
723                 }
724                 return 0;
725         }
726         catch(exception& e) {
727                 m->errorOut(e, "ShhherCommand", "execute");
728                 exit(1);
729         }
730 }
731 #endif
732 /**************************************************************************************************/
733
734 void ShhherCommand::getFlowData(){
735         try{
736                 ifstream flowFile;
737                 m->openInputFile(flowFileName, flowFile);
738                 
739                 string seqName;
740                 seqNameVector.clear();
741                 lengths.clear();
742                 flowDataIntI.clear();
743                 nameMap.clear();
744                 
745                 
746                 int currentNumFlowCells;
747                 
748                 float intensity;
749                 
750                 flowFile >> numFlowCells;
751                 int index = 0;//pcluster
752                 while(!flowFile.eof()){
753                         flowFile >> seqName >> currentNumFlowCells;
754                         lengths.push_back(currentNumFlowCells);
755
756                         seqNameVector.push_back(seqName);
757                         nameMap[seqName] = index++;//pcluster
758
759                         for(int i=0;i<numFlowCells;i++){
760                                 flowFile >> intensity;
761                                 if(intensity > 9.99)    {       intensity = 9.99;       }
762                                 int intI = int(100 * intensity + 0.0001);
763                                 flowDataIntI.push_back(intI);
764                         }
765                         m->gobble(flowFile);
766                 }
767                 flowFile.close();
768                 
769                 numSeqs = seqNameVector.size();         
770                 
771                 for(int i=0;i<numSeqs;i++){
772                         int iNumFlowCells = i * numFlowCells;
773                         for(int j=lengths[i];j<numFlowCells;j++){
774                                 flowDataIntI[iNumFlowCells + j] = 0;
775                         }
776                 }
777                 
778         }
779         catch(exception& e) {
780                 m->errorOut(e, "ShhherCommand", "getFlowData");
781                 exit(1);
782         }
783 }
784
785 /**************************************************************************************************/
786
787 void ShhherCommand::getSingleLookUp(){
788         try{
789                 //      these are the -log probabilities that a signal corresponds to a particular homopolymer length
790                 singleLookUp.assign(HOMOPS * NUMBINS, 0);
791                 
792                 int index = 0;
793                 ifstream lookUpFile;
794                 m->openInputFile(lookupFileName, lookUpFile);
795                 
796                 for(int i=0;i<HOMOPS;i++){
797                         float logFracFreq;
798                         lookUpFile >> logFracFreq;
799                         
800                         for(int j=0;j<NUMBINS;j++)      {
801                                 lookUpFile >> singleLookUp[index];
802                                 index++;
803                         }
804                 }       
805                 lookUpFile.close();
806         }
807         catch(exception& e) {
808                 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
809                 exit(1);
810         }
811 }
812
813 /**************************************************************************************************/
814
815 void ShhherCommand::getJointLookUp(){
816         try{
817                 
818                 //      the most likely joint probability (-log) that two intenities have the same polymer length
819                 jointLookUp.resize(NUMBINS * NUMBINS, 0);
820                 
821                 for(int i=0;i<NUMBINS;i++){
822                         for(int j=0;j<NUMBINS;j++){             
823                                 
824                                 double minSum = 100000000;
825                                 
826                                 for(int k=0;k<HOMOPS;k++){
827                                         double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
828                                         
829                                         if(sum < minSum)        {       minSum = sum;           }
830                                 }       
831                                 jointLookUp[i * NUMBINS + j] = minSum;
832                         }
833                 }
834         }
835         catch(exception& e) {
836                 m->errorOut(e, "ShhherCommand", "getJointLookUp");
837                 exit(1);
838         }
839 }
840
841 /**************************************************************************************************/
842
843 double ShhherCommand::getProbIntensity(int intIntensity){                          
844         try{
845
846                 double minNegLogProb = 100000000; 
847
848                 
849                 for(int i=0;i<HOMOPS;i++){//loop signal strength
850                         float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
851                         if(negLogProb < minNegLogProb)  {       minNegLogProb = negLogProb; }
852                 }
853                 
854                 return minNegLogProb;
855         }
856         catch(exception& e) {
857                 m->errorOut(e, "ShhherCommand", "getProbIntensity");
858                 exit(1);
859         }
860 }
861
862 /**************************************************************************************************/
863
864 void ShhherCommand::getUniques(){
865         try{
866                 
867                 
868                 numUniques = 0;
869                 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
870                 uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
871                 uniqueLengths.assign(numSeqs, 0);
872                 mapSeqToUnique.assign(numSeqs, -1);
873                 mapUniqueToSeq.assign(numSeqs, -1);
874                 
875                 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
876                 
877                 for(int i=0;i<numSeqs;i++){
878                         int index = 0;
879                         
880                         vector<short> current(numFlowCells);
881                         for(int j=0;j<numFlowCells;j++){
882                                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
883                         }
884                                                 
885                         for(int j=0;j<numUniques;j++){
886                                 int offset = j * numFlowCells;
887                                 bool toEnd = 1;
888                                 
889                                 int shorterLength;
890                                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
891                                 else                                                            {       shorterLength = uniqueLengths[j];       }
892
893                                 for(int k=0;k<shorterLength;k++){
894                                         if(current[k] != uniqueFlowgrams[offset + k]){
895                                                 toEnd = 0;
896                                                 break;
897                                         }
898                                 }
899                                 
900                                 if(toEnd){
901                                         mapSeqToUnique[i] = j;
902                                         uniqueCount[j]++;
903                                         index = j;
904                                         if(lengths[i] > uniqueLengths[j])       {       uniqueLengths[j] = lengths[i];  }
905                                         break;
906                                 }
907                                 index++;
908                         }
909                         
910                         if(index == numUniques){
911                                 uniqueLengths[numUniques] = lengths[i];
912                                 uniqueCount[numUniques] = 1;
913                                 mapSeqToUnique[i] = numUniques;//anMap
914                                 mapUniqueToSeq[numUniques] = i;//anF
915                                 
916                                 for(int k=0;k<numFlowCells;k++){
917                                         uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
918                                         uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
919                                 }
920                                 
921                                 numUniques++;
922                         }
923                 }
924                 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
925                 uniqueLengths.resize(numUniques);       
926                 
927                 flowDataPrI.resize(numSeqs * numFlowCells, 0);
928                 for(int i=0;i<flowDataPrI.size();i++)   {       flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);             }
929         }
930         catch(exception& e) {
931                 m->errorOut(e, "ShhherCommand", "getUniques");
932                 exit(1);
933         }
934 }
935
936 /**************************************************************************************************/
937
938 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
939         try{
940                 int minLength = lengths[mapSeqToUnique[seqA]];
941                 if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
942                 
943                 int ANumFlowCells = seqA * numFlowCells;
944                 int BNumFlowCells = seqB * numFlowCells;
945                 
946                 float dist = 0;
947                 
948                 for(int i=0;i<minLength;i++){
949                         int flowAIntI = flowDataIntI[ANumFlowCells + i];
950                         float flowAPrI = flowDataPrI[ANumFlowCells + i];
951                         
952                         int flowBIntI = flowDataIntI[BNumFlowCells + i];
953                         float flowBPrI = flowDataPrI[BNumFlowCells + i];
954                         dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
955                 }
956                 
957                 dist /= (float) minLength;
958                 return dist;
959         }
960         catch(exception& e) {
961                 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
962                 exit(1);
963         }
964 }
965
966 /**************************************************************************************************/
967
968 void ShhherCommand::flowDistParentFork(string distFileName, int startSeq, int stopSeq){
969         try{            
970
971                 ostringstream outStream;
972                 outStream.setf(ios::fixed, ios::floatfield);
973                 outStream.setf(ios::dec, ios::basefield);
974                 outStream.setf(ios::showpoint);
975                 outStream.precision(6);
976                 
977                 int begTime = time(NULL);
978                 double begClock = clock();
979
980                 for(int i=startSeq;i<stopSeq;i++){
981                         for(int j=0;j<i;j++){
982                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
983
984                                 if(flowDistance < 1e-6){
985                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
986                                 }
987                                 else if(flowDistance <= cutoff){
988                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
989                                 }
990                         }
991                         if(i % 100 == 0){
992                                 m->mothurOut(toString(i) + "\t" + toString(time(NULL) - begTime));
993                                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
994                                 m->mothurOutEndLine();
995                         }
996                 }
997                 m->mothurOut(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
998                 m->mothurOut("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC));
999                 m->mothurOutEndLine();
1000                 
1001                 ofstream distFile(distFileName.c_str());
1002                 distFile << outStream.str();            
1003                 distFile.close();
1004         }
1005         catch(exception& e) {
1006                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
1007                 exit(1);
1008         }
1009 }
1010
1011 /**************************************************************************************************/
1012
1013 string ShhherCommand::createDistFile(int processors){
1014         try{
1015                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.dist";
1016                                 
1017                 unsigned long int begTime = time(NULL);
1018                 double begClock = clock();
1019
1020                 vector<int> start;
1021                 vector<int> end;
1022                 
1023 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1024                 if(processors == 1)     {       flowDistParentFork(fDistFileName, 0, numUniques);               }
1025                 else{ //you have multiple processors
1026                         
1027                         if (numSeqs < processors){      processors = 1; }
1028                         
1029                         vector<int> start(processors, 0);
1030                         vector<int> end(processors, 0);
1031                         
1032                         for (int i = 0; i < processors; i++) {
1033                                 start[i] = int(sqrt(float(i)/float(processors)) * numUniques);
1034                                 end[i] = int(sqrt(float(i+1)/float(processors)) * numUniques);
1035                         }
1036                         
1037                         int process = 1;
1038                         vector<int> processIDs;
1039                         
1040                         //loop through and create all the processes you want
1041                         while (process != processors) {
1042                                 int pid = fork();
1043                                 
1044                                 if (pid > 0) {
1045                                         processIDs.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1046                                         process++;
1047                                 }else if (pid == 0){
1048                                         flowDistParentFork(fDistFileName + toString(getpid()) + ".temp", start[process], end[process]);
1049                                         exit(0);
1050                                 }else { 
1051                                         m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); 
1052                                         perror(" : ");
1053                                         for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill (temp, SIGINT); }
1054                                         exit(0);
1055                                 }
1056                         }
1057                         
1058                         //parent does its part
1059                         flowDistParentFork(fDistFileName, start[0], end[0]);
1060                         
1061                         //force parent to wait until all the processes are done
1062                         for (int i=0;i<processIDs.size();i++) { 
1063                                 int temp = processIDs[i];
1064                                 wait(&temp);
1065                         }
1066                         
1067                         //append and remove temp files
1068                         for (int i=0;i<processIDs.size();i++) { 
1069                                 m->appendFiles((fDistFileName + toString(processIDs[i]) + ".temp"), fDistFileName);
1070                                 remove((fDistFileName + toString(processIDs[i]) + ".temp").c_str());
1071                         }
1072                         
1073                 }
1074                 
1075 #else
1076                 flowDistParentFork(fDistFileName, 0, numUniques);
1077 #endif
1078
1079                 m->mothurOutEndLine();
1080                 
1081                 m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
1082                 
1083
1084                 return fDistFileName;
1085         }
1086         catch(exception& e) {
1087                 m->errorOut(e, "ShhherCommand", "createDistFile");
1088                 exit(1);
1089         }
1090         
1091 }
1092
1093 /**************************************************************************************************/
1094
1095 string ShhherCommand::createNamesFile(){
1096         try{
1097                 
1098                 vector<string> duplicateNames(numUniques, "");
1099                 for(int i=0;i<numSeqs;i++){
1100                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
1101                 }
1102                 
1103                 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.names";
1104                 
1105                 ofstream nameFile;
1106                 m->openOutputFile(nameFileName, nameFile);
1107                 
1108                 for(int i=0;i<numUniques;i++){
1109 //                      nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
1110                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
1111                 }
1112                 
1113                 nameFile.close();
1114                 return  nameFileName;
1115         }
1116         catch(exception& e) {
1117                 m->errorOut(e, "ShhherCommand", "createNamesFile");
1118                 exit(1);
1119         }
1120 }
1121
1122 //**********************************************************************************************************************
1123
1124 string ShhherCommand::cluster(string distFileName, string namesFileName){
1125         try {
1126                 
1127                 ReadMatrix* read = new ReadColumnMatrix(distFileName);  
1128                 read->setCutoff(cutoff);
1129                 
1130                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1131                 clusterNameMap->readMap();
1132                 read->read(clusterNameMap);
1133                 
1134                 ListVector* list = read->getListVector();
1135                 SparseMatrix* matrix = read->getMatrix();
1136                 
1137                 delete read; 
1138                 delete clusterNameMap; 
1139                                 
1140                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
1141                 
1142                 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
1143                 string tag = cluster->getTag();
1144                 
1145                 double clusterCutoff = cutoff;
1146                 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1147                         cluster->update(clusterCutoff);
1148                 }
1149                 
1150                 list->setLabel(toString(cutoff));
1151                 
1152                 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.list";
1153                 ofstream listFile;
1154                 m->openOutputFile(listFileName, listFile);
1155                 list->print(listFile);
1156                 listFile.close();
1157                 
1158                 delete matrix;  delete cluster; delete rabund; delete list;
1159         
1160                 return listFileName;
1161         }
1162         catch(exception& e) {
1163                 m->errorOut(e, "ShhherCommand", "cluster");
1164                 exit(1);        
1165         }               
1166 }
1167
1168 /**************************************************************************************************/
1169
1170 void ShhherCommand::getOTUData(string listFileName){
1171         try {
1172
1173                 ifstream listFile;
1174                 m->openInputFile(listFileName, listFile);
1175                 string label;
1176                 
1177                 listFile >> label >> numOTUs;
1178
1179                 otuData.assign(numSeqs, 0);
1180                 cumNumSeqs.assign(numOTUs, 0);
1181                 nSeqsPerOTU.assign(numOTUs, 0);
1182                 aaP.clear();aaP.resize(numOTUs);
1183                 
1184                 seqNumber.clear();
1185                 aaI.clear();
1186                 seqIndex.clear();
1187                 
1188                 string singleOTU = "";
1189                 
1190                 for(int i=0;i<numOTUs;i++){
1191
1192                         listFile >> singleOTU;
1193                         
1194                         istringstream otuString(singleOTU);
1195
1196                         while(otuString){
1197                                 
1198                                 string seqName = "";
1199                                 
1200                                 for(int j=0;j<singleOTU.length();j++){
1201                                         char letter = otuString.get();
1202                                         
1203                                         if(letter != ','){
1204                                                 seqName += letter;
1205                                         }
1206                                         else{
1207                                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
1208                                                 int index = nmIt->second;
1209                                                 
1210                                                 nameMap.erase(nmIt);
1211                                                 
1212                                                 otuData[index] = i;
1213                                                 nSeqsPerOTU[i]++;
1214                                                 aaP[i].push_back(index);
1215                                                 seqName = "";
1216                                         }
1217                                 }
1218                                 
1219                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
1220
1221                                 int index = nmIt->second;
1222                                 nameMap.erase(nmIt);
1223
1224                                 otuData[index] = i;
1225                                 nSeqsPerOTU[i]++;
1226                                 aaP[i].push_back(index);        
1227                                 
1228                                 otuString.get();
1229                         }
1230                         
1231                         sort(aaP[i].begin(), aaP[i].end());
1232                         for(int j=0;j<nSeqsPerOTU[i];j++){
1233                                 seqNumber.push_back(aaP[i][j]);
1234                         }
1235                         for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
1236                                 aaP[i].push_back(0);
1237                         }
1238                         
1239                         
1240                 }
1241                 
1242                 for(int i=1;i<numOTUs;i++){
1243                         cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
1244                 }
1245                 aaI = aaP;
1246                 seqIndex = seqNumber;
1247                 
1248                 listFile.close();       
1249                 
1250         }
1251         catch(exception& e) {
1252                 m->errorOut(e, "ShhherCommand", "getOTUData");
1253                 exit(1);        
1254         }               
1255 }
1256
1257 /**************************************************************************************************/
1258
1259 void ShhherCommand::initPyroCluster(){                          
1260         try{
1261                 dist.assign(numSeqs * numOTUs, 0);
1262                 change.assign(numOTUs, 1);
1263                 centroids.assign(numOTUs, -1);
1264                 weight.assign(numOTUs, 0);
1265                 singleTau.assign(numSeqs, 1.0);
1266                 
1267                 nSeqsBreaks.assign(processors+1, 0);
1268                 nOTUsBreaks.assign(processors+1, 0);
1269
1270                 nSeqsBreaks[0] = 0;
1271                 for(int i=0;i<processors;i++){
1272                         nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
1273                         nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
1274                 }
1275                 nSeqsBreaks[processors] = numSeqs;
1276                 nOTUsBreaks[processors] = numOTUs;
1277         }
1278         catch(exception& e) {
1279                 m->errorOut(e, "ShhherCommand", "initPyroCluster");
1280                 exit(1);        
1281         }
1282 }
1283
1284 /**************************************************************************************************/
1285
1286 void ShhherCommand::fill(){
1287         try {
1288                 int index = 0;
1289                 for(int i=0;i<numOTUs;i++){
1290                         cumNumSeqs[i] = index;
1291                         for(int j=0;j<nSeqsPerOTU[i];j++){
1292                                 seqNumber[index] = aaP[i][j];
1293                                 seqIndex[index] = aaI[i][j];
1294                                 
1295                                 index++;
1296                         }
1297                 }
1298         }
1299         catch(exception& e) {
1300                 m->errorOut(e, "ShhherCommand", "fill");
1301                 exit(1);        
1302         }               
1303 }
1304
1305 /**************************************************************************************************/
1306
1307 void ShhherCommand::calcCentroids(){                          
1308         try{
1309                 
1310 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1311
1312                 if(processors == 1)     {
1313                         calcCentroidsDriver(0, numOTUs);                
1314                 }
1315                 else{ //you have multiple processors
1316                         if (numOTUs < processors){      processors = 1; }
1317                         
1318                         int process = 1;
1319                         vector<int> processIDs;
1320                         
1321                         //loop through and create all the processes you want
1322                         while (process != processors) {
1323                                 int pid = vfork();
1324                                 
1325                                 if (pid > 0) {
1326                                         processIDs.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1327                                         process++;
1328                                 }else if (pid == 0){
1329                                         calcCentroidsDriver(nOTUsBreaks[process], nOTUsBreaks[process+1]);
1330                                         exit(0);
1331                                 }else { 
1332                                         m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); 
1333                                         perror(" : ");
1334                                         for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill (temp, SIGINT); }
1335                                         exit(0);
1336                                 }
1337                         }
1338                         
1339                         //parent does its part
1340                         calcCentroidsDriver(nOTUsBreaks[0], nOTUsBreaks[1]);
1341
1342                         //force parent to wait until all the processes are done
1343                         for (int i=0;i<processIDs.size();i++) { 
1344                                 int temp = processIDs[i];
1345                                 wait(&temp);
1346                         }
1347                 }
1348                 
1349 #else
1350                 calcCentroidsDriver(0, numOTUs);
1351 #endif          
1352         }
1353         catch(exception& e) {
1354                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1355                 exit(1);        
1356         }               
1357 }
1358
1359 /**************************************************************************************************/
1360
1361 void ShhherCommand::calcCentroidsDriver(int start, int finish){                          
1362         
1363         //this function gets the most likely homopolymer length at a flow position for a group of sequences
1364         //within an otu
1365         
1366         try{
1367                 
1368         
1369                 for(int i=start;i<finish;i++){
1370                         
1371                         double count = 0;
1372                         int position = 0;
1373                         int minFlowGram = 100000000;
1374                         double minFlowValue = 1e8;
1375                         change[i] = 0; //FALSE
1376                         
1377                         for(int j=0;j<nSeqsPerOTU[i];j++){
1378                                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1379                         }
1380
1381                         if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1382                                 vector<double> adF(nSeqsPerOTU[i]);
1383                                 vector<int> anL(nSeqsPerOTU[i]);
1384                                 
1385                                 for(int j=0;j<nSeqsPerOTU[i];j++){
1386                                         int index = cumNumSeqs[i] + j;
1387                                         int nI = seqIndex[index];
1388                                         int nIU = mapSeqToUnique[nI];
1389                                         
1390                                         int k;
1391                                         for(k=0;k<position;k++){
1392                                                 if(nIU == anL[k]){
1393                                                         break;
1394                                                 }
1395                                         }
1396                                         if(k == position){
1397                                                 anL[position] = nIU;
1398                                                 adF[position] = 0.0000;
1399                                                 position++;
1400                                         }                                               
1401                                 }
1402                                 
1403                                 for(int j=0;j<nSeqsPerOTU[i];j++){
1404                                         int index = cumNumSeqs[i] + j;
1405                                         int nI = seqIndex[index];
1406                                         
1407                                         double tauValue = singleTau[seqNumber[index]];
1408                                         
1409                                         for(int k=0;k<position;k++){
1410                                                 double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1411                                                 adF[k] += dist * tauValue;
1412                                         }
1413                                 }
1414                                 
1415                                 for(int j=0;j<position;j++){
1416                                         if(adF[j] < minFlowValue){
1417                                                 minFlowGram = j;
1418                                                 minFlowValue = adF[j];
1419                                         }
1420                                 }
1421                                 
1422                                 if(centroids[i] != anL[minFlowGram]){
1423                                         change[i] = 1;
1424                                         centroids[i] = anL[minFlowGram];
1425                                 }
1426                         }
1427                         else if(centroids[i] != -1){
1428                                 change[i] = 1;
1429                                 centroids[i] = -1;                      
1430                         }
1431                 }
1432         }
1433         catch(exception& e) {
1434                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1435                 exit(1);        
1436         }               
1437 }
1438
1439 /**************************************************************************************************/
1440
1441 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1442         try{
1443                 
1444                 int flowAValue = cent * numFlowCells;
1445                 int flowBValue = flow * numFlowCells;
1446                 
1447                 double dist = 0;
1448                 
1449                 for(int i=0;i<length;i++){
1450                         dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1451                         flowAValue++;
1452                         flowBValue++;
1453                 }
1454                 
1455                 return dist / (double)length;
1456         }
1457         catch(exception& e) {
1458                 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1459                 exit(1);        
1460         }               
1461 }
1462
1463 /**************************************************************************************************/
1464
1465 double ShhherCommand::getNewWeights(){
1466         try{
1467                 
1468                 double maxChange = 0;
1469                 
1470                 for(int i=0;i<numOTUs;i++){
1471                         
1472                         double difference = weight[i];
1473                         weight[i] = 0;
1474                         
1475                         for(int j=0;j<nSeqsPerOTU[i];j++){
1476                                 int index = cumNumSeqs[i] + j;
1477                                 double tauValue = singleTau[seqNumber[index]];
1478                                 weight[i] += tauValue;
1479                         }
1480                         
1481                         difference = fabs(weight[i] - difference);
1482                         if(difference > maxChange){     maxChange = difference; }
1483                 }
1484                 return maxChange;
1485         }
1486         catch(exception& e) {
1487                 m->errorOut(e, "ShhherCommand", "getNewWeights");
1488                 exit(1);        
1489         }               
1490 }
1491
1492 /**************************************************************************************************/
1493
1494 double ShhherCommand::getLikelihood(){
1495         
1496         try{
1497                 
1498                 vector<long double> P(numSeqs, 0);
1499                 int effNumOTUs = 0;
1500                 
1501                 for(int i=0;i<numOTUs;i++){
1502                         if(weight[i] > MIN_WEIGHT){
1503                                 effNumOTUs++;
1504                         }
1505                 }
1506                 
1507                 string hold;
1508                 for(int i=0;i<numOTUs;i++){
1509                         for(int j=0;j<nSeqsPerOTU[i];j++){
1510                                 int index = cumNumSeqs[i] + j;
1511                                 int nI = seqIndex[index];
1512                                 double singleDist = dist[seqNumber[index]];
1513                                 
1514                                 P[nI] += weight[i] * exp(-singleDist * sigma);
1515                         }
1516                 }
1517                 double nLL = 0.00;
1518                 for(int i=0;i<numSeqs;i++){
1519                         if(P[i] == 0){  P[i] = DBL_EPSILON;     }
1520
1521                         nLL += -log(P[i]);
1522                 }
1523                 
1524                 nLL = nLL -(double)numSeqs * log(sigma);
1525
1526                 return nLL; 
1527         }
1528         catch(exception& e) {
1529                 m->errorOut(e, "ShhherCommand", "getNewWeights");
1530                 exit(1);        
1531         }               
1532 }
1533
1534 /**************************************************************************************************/
1535
1536 void ShhherCommand::checkCentroids(){
1537         try{
1538                 vector<int> unique(numOTUs, 1);
1539                 
1540                 for(int i=0;i<numOTUs;i++){
1541                         if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1542                                 unique[i] = -1;
1543                         }
1544                 }
1545                 
1546                 for(int i=0;i<numOTUs;i++){
1547                         if(unique[i] == 1){
1548                                 for(int j=i+1;j<numOTUs;j++){
1549                                         if(unique[j] == 1){
1550                                                 
1551                                                 if(centroids[j] == centroids[i]){
1552                                                         unique[j] = 0;
1553                                                         centroids[j] = -1;
1554                                                         
1555                                                         weight[i] += weight[j];
1556                                                         weight[j] = 0.0;
1557                                                 }
1558                                         }
1559                                 }
1560                         }
1561                 }
1562         }
1563         catch(exception& e) {
1564                 m->errorOut(e, "ShhherCommand", "checkCentroids");
1565                 exit(1);        
1566         }               
1567 }
1568
1569 /**************************************************************************************************/
1570
1571 void ShhherCommand::calcNewDistances(){                          
1572         try{
1573                 
1574 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux)
1575                 
1576                 if(processors == 1)     {
1577                         calcNewDistancesParent(0, numSeqs);             
1578                 }
1579                 else{ //you have multiple processors
1580                         if (numSeqs < processors){      processors = 1; }
1581                         
1582                         vector<vector<int> > child_otuIndex(processors);
1583                         vector<vector<int> > child_seqIndex(processors);
1584                         vector<vector<double> > child_singleTau(processors);                    
1585                         vector<int> totals(processors);
1586                         
1587                         int process = 1;
1588                         vector<int> processIDs;
1589
1590                         //loop through and create all the processes you want
1591                         while (process != processors) {
1592                                 int pid = vfork();
1593                                 
1594                                 if (pid > 0) {
1595                                         processIDs.push_back(pid);  //create map from line number to pid so you can append files in correct order later
1596                                         process++;
1597                                 }else if (pid == 0){
1598                                         calcNewDistancesChild(nSeqsBreaks[process], nSeqsBreaks[process+1], child_otuIndex[process], child_seqIndex[process], child_singleTau[process]);
1599                                         totals[process] = child_otuIndex[process].size();
1600
1601                                         exit(0);
1602                                 }else { 
1603                                         m->mothurOut("[ERROR]: unable to spawn the necessary processes. Error code: " + toString(pid)); m->mothurOutEndLine(); 
1604                                         perror(" : ");
1605                                         for (int i=0;i<processIDs.size();i++) {  int temp = processIDs[i]; kill (temp, SIGINT); }
1606                                         exit(0);
1607                                 }
1608                         }
1609                         
1610                         //parent does its part
1611                         calcNewDistancesParent(nSeqsBreaks[0], nSeqsBreaks[1]);
1612                         int total = seqIndex.size();
1613
1614                         //force parent to wait until all the processes are done
1615                         for (int i=0;i<processIDs.size();i++) { 
1616                                 int temp = processIDs[i];
1617                                 wait(&temp);
1618                         }
1619
1620                         for(int i=1;i<processors;i++){
1621                                 int oldTotal = total;
1622                                 total += totals[i];
1623
1624                                 singleTau.resize(total, 0);
1625                                 seqIndex.resize(total, 0);
1626                                 seqNumber.resize(total, 0);
1627                                 
1628                                 int childIndex = 0;
1629                                 
1630                                 for(int j=oldTotal;j<total;j++){
1631                                         int otuI = child_otuIndex[i][childIndex];
1632                                         int seqI = child_seqIndex[i][childIndex];
1633
1634                                         singleTau[j] = child_singleTau[i][childIndex];
1635                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
1636                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
1637                                         nSeqsPerOTU[otuI]++;
1638
1639                                         childIndex++;
1640                                 }
1641                         }
1642                 }
1643 #else
1644                 calcNewDistancesParent(0, numSeqs);             
1645 #endif          
1646         }
1647         catch(exception& e) {
1648                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
1649                 exit(1);        
1650         }               
1651 }
1652
1653 /**************************************************************************************************/
1654 #ifdef USE_MPI
1655 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1656         
1657         try{
1658                 vector<double> newTau(numOTUs,0);
1659                 vector<double> norms(numSeqs, 0);
1660                 otuIndex.clear();
1661                 seqIndex.clear();
1662                 singleTau.clear();
1663                 
1664                 
1665                 
1666                 for(int i=startSeq;i<stopSeq;i++){
1667                         double offset = 1e8;
1668                         int indexOffset = i * numOTUs;
1669                         
1670                         for(int j=0;j<numOTUs;j++){
1671                                 
1672                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1673                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1674                                 }
1675                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1676                                         offset = dist[indexOffset + j];
1677                                 }
1678                         }
1679                         
1680                         for(int j=0;j<numOTUs;j++){
1681                                 if(weight[j] > MIN_WEIGHT){
1682                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1683                                         norms[i] += newTau[j];
1684                                 }
1685                                 else{
1686                                         newTau[j] = 0.0;
1687                                 }
1688                         }
1689                         
1690                         for(int j=0;j<numOTUs;j++){
1691
1692                                 newTau[j] /= norms[i];
1693                                 
1694                                 if(newTau[j] > MIN_TAU){
1695                                         otuIndex.push_back(j);
1696                                         seqIndex.push_back(i);
1697                                         singleTau.push_back(newTau[j]);
1698                                 }
1699                         }
1700                         
1701                 }
1702         }
1703         catch(exception& e) {
1704                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1705                 exit(1);        
1706         }               
1707 }
1708 #endif
1709 /**************************************************************************************************/
1710
1711 void ShhherCommand::calcNewDistancesChild(int startSeq, int stopSeq, vector<int>& child_otuIndex, vector<int>& child_seqIndex, vector<double>& child_singleTau){
1712         
1713         try{
1714                 vector<double> newTau(numOTUs,0);
1715                 vector<double> norms(numSeqs, 0);
1716                 child_otuIndex.resize(0);
1717                 child_seqIndex.resize(0);
1718                 child_singleTau.resize(0);
1719                 
1720                 for(int i=startSeq;i<stopSeq;i++){
1721                         double offset = 1e8;
1722                         int indexOffset = i * numOTUs;
1723                         
1724                         
1725                         for(int j=0;j<numOTUs;j++){
1726                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1727                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1728                                 }
1729                                 
1730                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1731                                         offset = dist[indexOffset + j];
1732                                 }
1733                         }
1734                         
1735                         for(int j=0;j<numOTUs;j++){
1736                                 if(weight[j] > MIN_WEIGHT){
1737                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1738                                         norms[i] += newTau[j];
1739                                 }
1740                                 else{
1741                                         newTau[j] = 0.0;
1742                                 }
1743                         }
1744                         
1745                         for(int j=0;j<numOTUs;j++){
1746                                 newTau[j] /= norms[i];
1747                                 
1748                                 if(newTau[j] > MIN_TAU){
1749                                         child_otuIndex.push_back(j);
1750                                         child_seqIndex.push_back(i);
1751                                         child_singleTau.push_back(newTau[j]);
1752                                 }
1753                         }
1754                 }
1755         }
1756         catch(exception& e) {
1757                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChild");
1758                 exit(1);        
1759         }               
1760 }
1761
1762 /**************************************************************************************************/
1763
1764 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1765         
1766         try{
1767                 
1768                 int total = 0;
1769                 vector<double> newTau(numOTUs,0);
1770                 vector<double> norms(numSeqs, 0);
1771                 nSeqsPerOTU.assign(numOTUs, 0);
1772                 
1773                 for(int i=startSeq;i<stopSeq;i++){
1774                         int indexOffset = i * numOTUs;
1775                         
1776                         double offset = 1e8;
1777                         
1778                         for(int j=0;j<numOTUs;j++){
1779                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1780                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1781                                 }
1782                                 
1783                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1784                                         offset = dist[indexOffset + j];
1785                                 }
1786                         }
1787                         
1788                         for(int j=0;j<numOTUs;j++){
1789                                 if(weight[j] > MIN_WEIGHT){
1790                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1791                                         norms[i] += newTau[j];
1792                                 }
1793                                 else{
1794                                         newTau[j] = 0.0;
1795                                 }
1796                         }
1797                         
1798                         for(int j=0;j<numOTUs;j++){
1799                                 newTau[j] /= norms[i];
1800                         }
1801                         
1802                         for(int j=0;j<numOTUs;j++){
1803                                 if(newTau[j] > MIN_TAU){
1804                                         
1805                                         int oldTotal = total;
1806                                         
1807                                         total++;
1808                                         
1809                                         singleTau.resize(total, 0);
1810                                         seqNumber.resize(total, 0);
1811                                         seqIndex.resize(total, 0);
1812                                         
1813                                         singleTau[oldTotal] = newTau[j];
1814                                         
1815                                         aaP[j][nSeqsPerOTU[j]] = oldTotal;
1816                                         aaI[j][nSeqsPerOTU[j]] = i;
1817                                         nSeqsPerOTU[j]++;
1818                                 }
1819                         }
1820                 }
1821         }
1822         catch(exception& e) {
1823                 m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1824                 exit(1);        
1825         }               
1826 }
1827
1828 /**************************************************************************************************/
1829
1830 void ShhherCommand::setOTUs(){
1831         
1832         try {
1833                 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1834                 
1835                 for(int i=0;i<numOTUs;i++){
1836                         for(int j=0;j<nSeqsPerOTU[i];j++){
1837                                 int index = cumNumSeqs[i] + j;
1838                                 double tauValue = singleTau[seqNumber[index]];
1839                                 int sIndex = seqIndex[index];
1840                                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
1841                         }
1842                 }
1843                 
1844                 for(int i=0;i<numSeqs;i++){
1845                         double maxTau = -1.0000;
1846                         int maxOTU = -1;
1847                         for(int j=0;j<numOTUs;j++){
1848                                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1849                                         maxTau = bigTauMatrix[i * numOTUs + j];
1850                                         maxOTU = j;
1851                                 }
1852                         }
1853                         
1854                         otuData[i] = maxOTU;
1855                 }
1856                 
1857                 nSeqsPerOTU.assign(numOTUs, 0);         
1858                 
1859                 for(int i=0;i<numSeqs;i++){
1860                         int index = otuData[i];
1861                         
1862                         singleTau[i] = 1.0000;
1863                         dist[i] = 0.0000;
1864                         
1865                         aaP[index][nSeqsPerOTU[index]] = i;
1866                         aaI[index][nSeqsPerOTU[index]] = i;
1867                         
1868                         nSeqsPerOTU[index]++;
1869                 }
1870                 fill(); 
1871         }
1872         catch(exception& e) {
1873                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
1874                 exit(1);        
1875         }               
1876 }
1877
1878 /**************************************************************************************************/
1879
1880 void ShhherCommand::writeQualities(vector<int> otuCounts){
1881         
1882         try {
1883                 string qualityFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.qual";
1884
1885                 ofstream qualityFile;
1886                 m->openOutputFile(qualityFileName, qualityFile);
1887
1888                 qualityFile.setf(ios::fixed, ios::floatfield);
1889                 qualityFile.setf(ios::showpoint);
1890                 qualityFile << setprecision(6);
1891                 
1892                 vector<vector<int> > qualities(numOTUs);
1893                 vector<double> pr(HOMOPS, 0);
1894                 
1895                 
1896                 for(int i=0;i<numOTUs;i++){
1897                         int index = 0;
1898                         int base = 0;
1899                         
1900                         if(nSeqsPerOTU[i] > 0){
1901                                 qualities[i].assign(1024, -1);
1902                                 
1903                                 while(index < numFlowCells){
1904                                         double maxPrValue = 1e8;
1905                                         short maxPrIndex = -1;
1906                                         double count = 0.0000;
1907                                         
1908                                         pr.assign(HOMOPS, 0);
1909                                         
1910                                         for(int j=0;j<nSeqsPerOTU[i];j++){
1911                                                 int lIndex = cumNumSeqs[i] + j;
1912                                                 double tauValue = singleTau[seqNumber[lIndex]];
1913                                                 int sequenceIndex = aaI[i][j];
1914                                                 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1915                                                 
1916                                                 count += tauValue;
1917                                                 
1918                                                 for(int s=0;s<HOMOPS;s++){
1919                                                         pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1920                                                 }
1921                                         }
1922                                         
1923                                         maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1924                                         maxPrValue = pr[maxPrIndex];
1925                                         
1926                                         if(count > MIN_COUNT){
1927                                                 double U = 0.0000;
1928                                                 double norm = 0.0000;
1929                                                 
1930                                                 for(int s=0;s<HOMOPS;s++){
1931                                                         norm += exp(-(pr[s] - maxPrValue));
1932                                                 }
1933                                                 
1934                                                 for(int s=1;s<=maxPrIndex;s++){
1935                                                         int value = 0;
1936                                                         double temp = 0.0000;
1937                                                         
1938                                                         U += exp(-(pr[s-1]-maxPrValue))/norm;
1939                                                         
1940                                                         if(U>0.00){
1941                                                                 temp = log10(U);
1942                                                         }
1943                                                         else{
1944                                                                 temp = -10.1;
1945                                                         }
1946                                                         temp = floor(-10 * temp);
1947                                                         value = (int)floor(temp);
1948                                                         if(value > 100){        value = 100;    }
1949                                                         
1950                                                         qualities[i][base] = (int)value;
1951                                                         base++;
1952                                                 }
1953                                         }
1954                                         
1955                                         index++;
1956                                 }
1957                         }
1958                         
1959                         
1960                         if(otuCounts[i] > 0){
1961                                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1962                                 
1963                                 int j=4;        //need to get past the first four bases
1964                                 while(qualities[i][j] != -1){
1965                                         qualityFile << qualities[i][j] << ' ';
1966                                         j++;
1967                                 }
1968                                 qualityFile << endl;
1969                         }
1970                 }
1971                 qualityFile.close();
1972                 
1973         }
1974         catch(exception& e) {
1975                 m->errorOut(e, "ShhherCommand", "writeQualities");
1976                 exit(1);        
1977         }               
1978 }
1979
1980 /**************************************************************************************************/
1981
1982 void ShhherCommand::writeSequences(vector<int> otuCounts){
1983         try {
1984                 
1985                 string fastaFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.fasta";
1986                 ofstream fastaFile;
1987                 m->openOutputFile(fastaFileName, fastaFile);
1988                 
1989                 vector<string> names(numOTUs, "");
1990                 
1991                 for(int i=0;i<numOTUs;i++){
1992                         int index = centroids[i];
1993                         
1994                         if(otuCounts[i] > 0){
1995                                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1996                                 
1997                                 string newSeq = "";
1998                                 
1999                                 for(int j=0;j<numFlowCells;j++){
2000                                         
2001                                         char base = flowOrder[j % 4];
2002                                         for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
2003                                                 newSeq += base;
2004                                         }
2005                                 }
2006                                 
2007                                 fastaFile << newSeq.substr(4) << endl;
2008                         }
2009                 }
2010                 fastaFile.close();
2011                 
2012                 if(compositeFASTAFileName != ""){
2013                         m->appendFiles(fastaFileName, compositeFASTAFileName);
2014                 }
2015         }
2016         catch(exception& e) {
2017                 m->errorOut(e, "ShhherCommand", "writeSequences");
2018                 exit(1);        
2019         }               
2020 }
2021
2022 /**************************************************************************************************/
2023
2024 void ShhherCommand::writeNames(vector<int> otuCounts){
2025         try {
2026                 string nameFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.final.names";
2027                 ofstream nameFile;
2028                 m->openOutputFile(nameFileName, nameFile);
2029                 
2030                 for(int i=0;i<numOTUs;i++){
2031                         if(otuCounts[i] > 0){
2032                                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
2033                                 
2034                                 for(int j=1;j<nSeqsPerOTU[i];j++){
2035                                         nameFile << ',' << seqNameVector[aaI[i][j]];
2036                                 }
2037                                 
2038                                 nameFile << endl;
2039                         }
2040                 }
2041                 nameFile.close();
2042         }
2043         catch(exception& e) {
2044                 m->errorOut(e, "ShhherCommand", "writeNames");
2045                 exit(1);        
2046         }               
2047 }
2048
2049 /**************************************************************************************************/
2050
2051 void ShhherCommand::writeGroups(){
2052         try {
2053                 string fileRoot = flowFileName.substr(0,flowFileName.find_last_of('.'));
2054                 string groupFileName = fileRoot + ".pn.groups";
2055                 ofstream groupFile;
2056                 m->openOutputFile(groupFileName, groupFile);
2057                 
2058                 for(int i=0;i<numSeqs;i++){
2059                         groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
2060                 }
2061                 groupFile.close();
2062         }
2063         catch(exception& e) {
2064                 m->errorOut(e, "ShhherCommand", "writeGroups");
2065                 exit(1);        
2066         }               
2067 }
2068
2069 /**************************************************************************************************/
2070
2071 void ShhherCommand::writeClusters(vector<int> otuCounts){
2072         try {
2073                 string otuCountsFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".pn.counts";
2074                 ofstream otuCountsFile;
2075                 m->openOutputFile(otuCountsFileName, otuCountsFile);
2076                 
2077                 string bases = flowOrder;
2078                 
2079                 for(int i=0;i<numOTUs;i++){
2080                         //output the translated version of the centroid sequence for the otu
2081                         if(otuCounts[i] > 0){
2082                                 int index = centroids[i];
2083                                 
2084                                 otuCountsFile << "ideal\t";
2085                                 for(int j=8;j<numFlowCells;j++){
2086                                         char base = bases[j % 4];
2087                                         for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
2088                                                 otuCountsFile << base;
2089                                         }
2090                                 }
2091                                 otuCountsFile << endl;
2092                                 
2093                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2094                                         int sequence = aaI[i][j];
2095                                         otuCountsFile << seqNameVector[sequence] << '\t';
2096                                         
2097                                         string newSeq = "";
2098                                         
2099                                         for(int k=0;k<lengths[sequence];k++){
2100                                                 char base = bases[k % 4];
2101                                                 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
2102                                                         
2103                                                 for(int s=0;s<freq;s++){
2104                                                         newSeq += base;
2105                                                         //otuCountsFile << base;
2106                                                 }
2107                                         }
2108                                         otuCountsFile << newSeq.substr(4) << endl;
2109                                 }
2110                                 otuCountsFile << endl;
2111                         }
2112                 }
2113                 otuCountsFile.close();
2114         }
2115         catch(exception& e) {
2116                 m->errorOut(e, "ShhherCommand", "writeClusters");
2117                 exit(1);        
2118         }               
2119 }
2120
2121 //**********************************************************************************************************************