]> git.donarmstrong.com Git - mothur.git/blob - shhhercommand.cpp
added mothurOutJustToScreen function and changed all counter outputs to use it.
[mothur.git] / shhhercommand.cpp
1 /*
2  *  shhher.cpp
3  *  Mothur
4  *
5  *  Created by Pat Schloss on 12/27/10.
6  *  Copyright 2010 Schloss Lab. All rights reserved.
7  *
8  */
9
10 #include "shhhercommand.h"
11
12 //**********************************************************************************************************************
13 vector<string> ShhherCommand::setParameters(){  
14         try {
15                 CommandParameter pflow("flow", "InputTypes", "", "", "none", "fileflow", "none","fasta-name-group-counts-qfile",false,false,true); parameters.push_back(pflow);
16                 CommandParameter pfile("file", "InputTypes", "", "", "none", "fileflow", "none","fasta-name-group-counts-qfile",false,false,true); parameters.push_back(pfile);
17                 CommandParameter plookup("lookup", "InputTypes", "", "", "none", "none", "none","",false,false,true); parameters.push_back(plookup);
18                 CommandParameter pcutoff("cutoff", "Number", "", "0.01", "", "", "","",false,false); parameters.push_back(pcutoff);
19                 CommandParameter pprocessors("processors", "Number", "", "1", "", "", "","",false,false,true); parameters.push_back(pprocessors);
20                 CommandParameter pmaxiter("maxiter", "Number", "", "1000", "", "", "","",false,false); parameters.push_back(pmaxiter);
21         CommandParameter plarge("large", "Number", "", "-1", "", "", "","",false,false); parameters.push_back(plarge);
22                 CommandParameter psigma("sigma", "Number", "", "60", "", "", "","",false,false); parameters.push_back(psigma);
23                 CommandParameter pmindelta("mindelta", "Number", "", "0.000001", "", "", "","",false,false); parameters.push_back(pmindelta);
24         CommandParameter porder("order", "Multiple", "A-B", "A", "", "", "","",false,false, true); parameters.push_back(porder);                CommandParameter pinputdir("inputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(pinputdir);
25                 CommandParameter poutputdir("outputdir", "String", "", "", "", "", "","",false,false); parameters.push_back(poutputdir);
26                 
27                 vector<string> myArray;
28                 for (int i = 0; i < parameters.size(); i++) {   myArray.push_back(parameters[i].name);          }
29                 return myArray;
30         }
31         catch(exception& e) {
32                 m->errorOut(e, "ShhherCommand", "setParameters");
33                 exit(1);
34         }
35 }
36 //**********************************************************************************************************************
37 string ShhherCommand::getHelpString(){  
38         try {
39                 string helpString = "";
40                 helpString += "The shhh.flows command reads a file containing flowgrams and creates a file of corrected sequences.\n";
41         helpString += "The shhh.flows command parameters are flow, file, lookup, cutoff, processors, large, maxiter, sigma, mindelta and order.\n";
42         helpString += "The flow parameter is used to input your flow file.\n";
43         helpString += "The file parameter is used to input the *flow.files file created by trim.flows.\n";
44         helpString += "The lookup parameter is used specify the lookup file you would like to use. http://www.mothur.org/wiki/Lookup_files.\n";
45         helpString += "The order parameter options are A or B.  A = TACG and B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC.\n";
46                 return helpString;
47         }
48         catch(exception& e) {
49                 m->errorOut(e, "ShhherCommand", "getHelpString");
50                 exit(1);
51         }
52 }
53 //**********************************************************************************************************************
54 string ShhherCommand::getOutputPattern(string type) {
55     try {
56         string pattern = "";
57         
58         if (type == "fasta")            {   pattern = "[filename],shhh.fasta";   }
59         else if (type == "name")    {   pattern = "[filename],shhh.names";   }
60         else if (type == "group")        {   pattern = "[filename],shhh.groups";   }
61         else if (type == "counts")        {   pattern = "[filename],shhh.counts";   }
62         else if (type == "qfile")        {   pattern = "[filename],shhh.qual";   }
63         else { m->mothurOut("[ERROR]: No definition for type " + type + " output pattern.\n"); m->control_pressed = true;  }
64         
65         return pattern;
66     }
67     catch(exception& e) {
68         m->errorOut(e, "ShhherCommand", "getOutputPattern");
69         exit(1);
70     }
71 }
72 //**********************************************************************************************************************
73
74 ShhherCommand::ShhherCommand(){ 
75         try {
76                 abort = true; calledHelp = true;
77                 setParameters();
78                 
79                 //initialize outputTypes
80                 vector<string> tempOutNames;
81                 outputTypes["fasta"] = tempOutNames;
82         outputTypes["name"] = tempOutNames;
83         outputTypes["group"] = tempOutNames;
84         outputTypes["counts"] = tempOutNames;
85         outputTypes["qfile"] = tempOutNames;
86
87         }
88         catch(exception& e) {
89                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
90                 exit(1);
91         }
92 }
93
94 //**********************************************************************************************************************
95
96 ShhherCommand::ShhherCommand(string option) {
97         try {
98         
99 #ifdef USE_MPI
100                 MPI_Comm_rank(MPI_COMM_WORLD, &pid); //find out who we are
101                 MPI_Comm_size(MPI_COMM_WORLD, &ncpus);
102         
103                 if(pid == 0){
104 #endif
105                 abort = false; calledHelp = false;   
106                 
107                 //allow user to run help
108                 if(option == "help") { help(); abort = true; calledHelp = true; }
109                 else if(option == "citation") { citation(); abort = true; calledHelp = true;}
110                 
111                 else {
112                         vector<string> myArray = setParameters();
113                         
114                         OptionParser parser(option);
115                         map<string,string> parameters = parser.getParameters();
116                         
117                         ValidParameters validParameter;
118                         map<string,string>::iterator it;
119                         
120                         //check to make sure all parameters are valid for command
121                         for (it = parameters.begin(); it != parameters.end(); it++) { 
122                                 if (validParameter.isValidParameter(it->first, myArray, it->second) != true) {  abort = true;  }
123                         }
124                         
125                         //initialize outputTypes
126             vector<string> tempOutNames;
127             outputTypes["fasta"] = tempOutNames;
128             outputTypes["name"] = tempOutNames;
129             outputTypes["group"] = tempOutNames;
130             outputTypes["counts"] = tempOutNames;
131             outputTypes["qfile"] = tempOutNames;
132
133                         
134                         //if the user changes the input directory command factory will send this info to us in the output parameter 
135                         string inputDir = validParameter.validFile(parameters, "inputdir", false);              
136                         if (inputDir == "not found"){   inputDir = "";          }
137                         else {
138                                 string path;
139                                 it = parameters.find("flow");
140                                 //user has given a template file
141                                 if(it != parameters.end()){ 
142                                         path = m->hasPath(it->second);
143                                         //if the user has not given a path then, add inputdir. else leave path alone.
144                                         if (path == "") {       parameters["flow"] = inputDir + it->second;             }
145                                 }
146                                 
147                                 it = parameters.find("lookup");
148                                 //user has given a template file
149                                 if(it != parameters.end()){ 
150                                         path = m->hasPath(it->second);
151                                         //if the user has not given a path then, add inputdir. else leave path alone.
152                                         if (path == "") {       parameters["lookup"] = inputDir + it->second;           }
153                                 }
154
155                                 it = parameters.find("file");
156                                 //user has given a template file
157                                 if(it != parameters.end()){ 
158                                         path = m->hasPath(it->second);
159                                         //if the user has not given a path then, add inputdir. else leave path alone.
160                                         if (path == "") {       parameters["file"] = inputDir + it->second;             }
161                                 }
162                         }
163             
164             //if the user changes the output directory command factory will send this info to us in the output parameter 
165                         outputDir = validParameter.validFile(parameters, "outputdir", false);           if (outputDir == "not found"){  outputDir = ""; }
166             
167                         //check for required parameters
168                         flowFileName = validParameter.validFile(parameters, "flow", true);
169                         flowFilesFileName = validParameter.validFile(parameters, "file", true);
170                         if (flowFileName == "not found" && flowFilesFileName == "not found") {
171                                 m->mothurOut("values for either flow or file must be provided for the shhh.flows command.");
172                                 m->mothurOutEndLine();
173                                 abort = true; 
174                         }
175                         else if (flowFileName == "not open" || flowFilesFileName == "not open") { abort = true; }
176                         
177                         if(flowFileName != "not found"){
178                                 compositeFASTAFileName = "";    
179                                 compositeNamesFileName = "";    
180                         }
181                         else{
182                                 ofstream temp;
183                 
184                 string thisoutputDir = outputDir;
185                 if (outputDir == "") {  thisoutputDir =  m->hasPath(flowFilesFileName); } //if user entered a file with a path then preserve it
186                 
187                                 //we want to rip off .files, and also .flow if its there
188                 string fileroot = m->getRootName(m->getSimpleName(flowFilesFileName));
189                 if (fileroot[fileroot.length()-1] == '.') {  fileroot = fileroot.substr(0, fileroot.length()-1); } //rip off dot
190                 string extension = m->getExtension(fileroot);
191                 if (extension == ".flow") { fileroot = m->getRootName(fileroot);  }
192                 else { fileroot += "."; } //add back if needed
193                 
194                                 compositeFASTAFileName = thisoutputDir + fileroot + "shhh.fasta";
195                                 m->openOutputFile(compositeFASTAFileName, temp);
196                                 temp.close();
197                                 
198                                 compositeNamesFileName = thisoutputDir + fileroot + "shhh.names";
199                                 m->openOutputFile(compositeNamesFileName, temp);
200                                 temp.close();
201                         }
202             
203             if(flowFilesFileName != "not found"){
204                 string fName;
205                 
206                 ifstream flowFilesFile;
207                 m->openInputFile(flowFilesFileName, flowFilesFile);
208                 while(flowFilesFile){
209                     fName = m->getline(flowFilesFile);
210                     
211                     //test if file is valid
212                     ifstream in;
213                     int ableToOpen = m->openInputFile(fName, in, "noerror");
214                     in.close(); 
215                     if (ableToOpen == 1) {
216                         if (inputDir != "") { //default path is set
217                             string tryPath = inputDir + fName;
218                             m->mothurOut("Unable to open " + fName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
219                             ifstream in2;
220                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
221                             in2.close();
222                             fName = tryPath;
223                         }
224                     }
225                     
226                     if (ableToOpen == 1) {
227                         if (m->getDefaultPath() != "") { //default path is set
228                             string tryPath = m->getDefaultPath() + m->getSimpleName(fName);
229                             m->mothurOut("Unable to open " + fName + ". Trying default " + tryPath); m->mothurOutEndLine();
230                             ifstream in2;
231                             ableToOpen = m->openInputFile(tryPath, in2, "noerror");
232                             in2.close();
233                             fName = tryPath;
234                         }
235                     }
236                     
237                     //if you can't open it its not in current working directory or inputDir, try mothur excutable location
238                     if (ableToOpen == 1) {
239                         string exepath = m->argv;
240                         string tempPath = exepath;
241                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
242                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
243                         
244                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(fName);
245                         m->mothurOut("Unable to open " + fName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
246                         ifstream in2;
247                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
248                         in2.close();
249                         fName = tryPath;
250                     }
251                     
252                     if (ableToOpen == 1) {  m->mothurOut("Unable to open " + fName + ". Disregarding. "); m->mothurOutEndLine();  }
253                     else { flowFileVector.push_back(fName); }
254                     m->gobble(flowFilesFile);
255                 }
256                 flowFilesFile.close();
257                 if (flowFileVector.size() == 0) {  m->mothurOut("[ERROR]: no valid files."); m->mothurOutEndLine(); abort = true; }
258             }
259             else{
260                 if (outputDir == "") { outputDir = m->hasPath(flowFileName); }
261                 flowFileVector.push_back(flowFileName);
262             }
263                 
264                         //check for optional parameter and set defaults
265                         // ...at some point should added some additional type checking...
266                         string temp;
267                         temp = validParameter.validFile(parameters, "lookup", true);
268                         if (temp == "not found")        {       
269                                 lookupFileName = "LookUp_Titanium.pat"; 
270                                 
271                                 int ableToOpen;
272                                 ifstream in;
273                                 ableToOpen = m->openInputFile(lookupFileName, in, "noerror");
274                                 in.close();     
275                                 
276                                 //if you can't open it, try input location
277                                 if (ableToOpen == 1) {
278                                         if (inputDir != "") { //default path is set
279                                                 string tryPath = inputDir + lookupFileName;
280                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying input directory " + tryPath); m->mothurOutEndLine();
281                                                 ifstream in2;
282                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
283                                                 in2.close();
284                                                 lookupFileName = tryPath;
285                                         }
286                                 }
287                                 
288                                 //if you can't open it, try default location
289                                 if (ableToOpen == 1) {
290                                         if (m->getDefaultPath() != "") { //default path is set
291                                                 string tryPath = m->getDefaultPath() + m->getSimpleName(lookupFileName);
292                                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying default " + tryPath); m->mothurOutEndLine();
293                                                 ifstream in2;
294                                                 ableToOpen = m->openInputFile(tryPath, in2, "noerror");
295                                                 in2.close();
296                                                 lookupFileName = tryPath;
297                                         }
298                                 }
299                                 
300                                 //if you can't open it its not in current working directory or inputDir, try mothur excutable location
301                                 if (ableToOpen == 1) {
302                                         string exepath = m->argv;
303                                         string tempPath = exepath;
304                                         for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
305                                         exepath = exepath.substr(0, (tempPath.find_last_of('m')));
306                                         
307                                         string tryPath = m->getFullPathName(exepath) + m->getSimpleName(lookupFileName);
308                                         m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
309                                         ifstream in2;
310                                         ableToOpen = m->openInputFile(tryPath, in2, "noerror");
311                                         in2.close();
312                                         lookupFileName = tryPath;
313                                 }
314                                 
315                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
316                         }
317                         else if(temp == "not open")     {       
318                                 
319                                 lookupFileName = validParameter.validFile(parameters, "lookup", false);
320                                 
321                                 //if you can't open it its not inputDir, try mothur excutable location
322                                 string exepath = m->argv;
323                                 string tempPath = exepath;
324                                 for (int i = 0; i < exepath.length(); i++) { tempPath[i] = tolower(exepath[i]); }
325                                 exepath = exepath.substr(0, (tempPath.find_last_of('m')));
326                                         
327                                 string tryPath = m->getFullPathName(exepath) + lookupFileName;
328                                 m->mothurOut("Unable to open " + lookupFileName + ". Trying mothur's executable location " + tryPath); m->mothurOutEndLine();
329                                 ifstream in2;
330                                 int ableToOpen = m->openInputFile(tryPath, in2, "noerror");
331                                 in2.close();
332                                 lookupFileName = tryPath;
333                                 
334                                 if (ableToOpen == 1) {  m->mothurOut("Unable to open " + lookupFileName + "."); m->mothurOutEndLine(); abort=true;  }
335                         }else                                           {       lookupFileName = temp;  }
336                         
337                         temp = validParameter.validFile(parameters, "processors", false);       if (temp == "not found"){       temp = m->getProcessors();      }
338                         m->setProcessors(temp);
339                         m->mothurConvert(temp, processors);
340
341                         temp = validParameter.validFile(parameters, "cutoff", false);   if (temp == "not found"){       temp = "0.01";          }
342                         m->mothurConvert(temp, cutoff); 
343                         
344                         temp = validParameter.validFile(parameters, "mindelta", false); if (temp == "not found"){       temp = "0.000001";      }
345                         m->mothurConvert(temp, minDelta); 
346
347                         temp = validParameter.validFile(parameters, "maxiter", false);  if (temp == "not found"){       temp = "1000";          }
348                         m->mothurConvert(temp, maxIters); 
349             
350             temp = validParameter.validFile(parameters, "large", false);        if (temp == "not found"){       temp = "0";             }
351                         m->mothurConvert(temp, largeSize); 
352             if (largeSize != 0) { large = true; }
353             else { large = false;  }
354             if (largeSize < 0) {  m->mothurOut("The value of the large cannot be negative.\n"); }
355             
356 #ifdef USE_MPI
357             if (large) { m->mothurOut("The large parameter is not available with the MPI-Enabled version.\n"); large=false; }                           
358 #endif
359             
360
361                         temp = validParameter.validFile(parameters, "sigma", false);if (temp == "not found")    {       temp = "60";            }
362                         m->mothurConvert(temp, sigma); 
363                         
364                         temp = validParameter.validFile(parameters, "order", false);  if (temp == "not found"){         temp = "A";     }
365             if (temp.length() > 1) {  m->mothurOut("[ERROR]: " + temp + " is not a valid option for order. order options are A or B. A = TACG and B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC.\n");  abort=true;
366             }
367             else {
368                 if (toupper(temp[0]) == 'A') {  flowOrder = "TACG";   }
369                 else if(toupper(temp[0]) == 'B'){ 
370                     flowOrder = "TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC";   }
371                 else {
372                     m->mothurOut("[ERROR]: " + temp + " is not a valid option for order. order options are A or B. A = TACG and B = TACGTACGTACGATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATAGATCGCATGACGATCGCATATCGTCAGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGTAGTCGAGCATCATCTGACGCAGTACGTGCATGATCTCAGTCAGCAGCTATGTCAGTGCATGCATAGATCGCATGACGATCGCATATCGTCAGTGCAGTGACTGATCGTCATCAGCTAGCATCGACTGCATGATCTCAGTCAGCAGC.\n");  abort=true;
373                 }
374             }
375
376                         
377                 }
378 #ifdef USE_MPI
379                 }                               
380 #endif
381         }
382         catch(exception& e) {
383                 m->errorOut(e, "ShhherCommand", "ShhherCommand");
384                 exit(1);
385         }
386 }
387 //**********************************************************************************************************************
388 #ifdef USE_MPI
389 int ShhherCommand::execute(){
390         try {
391                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
392                 
393                 int tag = 1976;
394                 MPI_Status status; 
395                         
396                 if(pid == 0){
397
398                         for(int i=1;i<ncpus;i++){
399                                 MPI_Send(&abort, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
400                         }
401                         if(abort == 1){ return 0;       }
402
403                         processors = ncpus;
404                         
405                         m->mothurOut("\nGetting preliminary data...\n");
406                         getSingleLookUp();      if (m->control_pressed) { return 0; }
407                         getJointLookUp();       if (m->control_pressed) { return 0; }
408                         
409             vector<string> flowFileVector;
410                         if(flowFilesFileName != "not found"){
411                                 string fName;
412                 
413                                 ifstream flowFilesFile;
414                                 m->openInputFile(flowFilesFileName, flowFilesFile);
415                                 while(flowFilesFile){
416                                         fName = m->getline(flowFilesFile);
417                                         flowFileVector.push_back(fName);
418                                         m->gobble(flowFilesFile);
419                                 }
420                         }
421                         else{
422                                 flowFileVector.push_back(flowFileName);
423                         }
424             
425                         int numFiles = flowFileVector.size();
426
427                         for(int i=1;i<ncpus;i++){
428                                 MPI_Send(&numFiles, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
429                         }
430                         
431                         for(int i=0;i<numFiles;i++){
432                                 
433                                 if (m->control_pressed) { break; }
434                                 
435                                 double begClock = clock();
436                                 unsigned long long begTime = time(NULL);
437
438                                 flowFileName = flowFileVector[i];
439                                 
440                                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(numFiles) + ")\t<<<<<\n");
441                                 m->mothurOut("Reading flowgrams...\n");
442                                 getFlowData();
443                                 
444                                 if (m->control_pressed) { break; }
445
446                                 m->mothurOut("Identifying unique flowgrams...\n");
447                                 getUniques();
448                                 
449                                 if (m->control_pressed) { break; }
450
451                                 m->mothurOut("Calculating distances between flowgrams...\n");
452                                 char fileName[1024];
453                                 strcpy(fileName, flowFileName.c_str());
454
455                                 for(int i=1;i<ncpus;i++){
456                                         MPI_Send(&fileName[0], 1024, MPI_CHAR, i, tag, MPI_COMM_WORLD);
457
458                                         MPI_Send(&numSeqs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
459                                         MPI_Send(&numUniques, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
460                                         MPI_Send(&numFlowCells, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
461                                         MPI_Send(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, i, tag, MPI_COMM_WORLD);
462                                         MPI_Send(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
463                                         MPI_Send(&mapUniqueToSeq[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
464                                         MPI_Send(&mapSeqToUnique[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
465                                         MPI_Send(&lengths[0], numSeqs, MPI_INT, i, tag, MPI_COMM_WORLD);
466                                         MPI_Send(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
467                                         MPI_Send(&cutoff, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
468                                 }                               
469                                                         
470                                 string distFileName = flowDistMPI(0, int(sqrt(1.0/float(ncpus)) * numUniques));
471                                 
472                                 if (m->control_pressed) { break; }
473                                 
474                                 int done;
475                                 for(int i=1;i<ncpus;i++){
476                                         MPI_Recv(&done, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
477                                         
478                                         m->appendFiles((distFileName + ".temp." + toString(i)), distFileName);
479                                         m->mothurRemove((distFileName + ".temp." + toString(i)));
480                                 }
481
482                                 string namesFileName = createNamesFile();
483                                 
484                                 if (m->control_pressed) { break; }
485                                 
486                                 m->mothurOut("\nClustering flowgrams...\n");
487                                 string listFileName = cluster(distFileName, namesFileName);
488                                 
489                                 if (m->control_pressed) { break; }
490                                 
491                 
492                 
493                                 getOTUData(listFileName);
494
495                                 m->mothurRemove(distFileName);
496                                 m->mothurRemove(namesFileName);
497                                 m->mothurRemove(listFileName);
498                                 
499                                 if (m->control_pressed) { break; }
500                                 
501                                 initPyroCluster();
502
503                                 if (m->control_pressed) { break; }
504                                 
505             
506                                 for(int i=1;i<ncpus;i++){
507                                         MPI_Send(&numOTUs, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
508                                         MPI_Send(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
509                                         MPI_Send(&uniqueFlowgrams[0], numFlowCells * numUniques, MPI_SHORT, i, tag, MPI_COMM_WORLD);
510                                         MPI_Send(&sigma, 1, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
511                                 }
512                                 
513                                 if (m->control_pressed) { break; }
514                                 
515                                 double maxDelta = 0;
516                                 int iter = 0;
517                                 
518                                 int numOTUsOnCPU = numOTUs / ncpus;
519                                 int numSeqsOnCPU = numSeqs / ncpus;
520                                 m->mothurOut("\nDenoising flowgrams...\n");
521                                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
522                                 
523                                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
524                                         
525                                         double cycClock = clock();
526                                         unsigned long long cycTime = time(NULL);
527                                         fill();
528                                         
529                                         if (m->control_pressed) { break; }
530
531                                         int total = singleTau.size();
532                                         for(int i=1;i<ncpus;i++){
533                                                 MPI_Send(&total, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
534                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
535                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
536                                                 
537                                                 MPI_Send(&singleTau[0], total, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
538                                                 MPI_Send(&seqNumber[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
539                                                 MPI_Send(&seqIndex[0], total, MPI_INT, i, tag, MPI_COMM_WORLD);
540                                                 MPI_Send(&nSeqsPerOTU[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
541                                                 MPI_Send(&cumNumSeqs[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
542                                         }
543                                         
544                                         calcCentroidsDriver(0, numOTUsOnCPU);
545                                         
546                                         for(int i=1;i<ncpus;i++){
547                                                 int otuStart = i * numOTUs / ncpus;
548                                                 int otuStop = (i + 1) * numOTUs / ncpus;
549                                                 
550                                                 vector<int> tempCentroids(numOTUs, 0);
551                                                 vector<short> tempChange(numOTUs, 0);
552                                                 
553                                                 MPI_Recv(&tempCentroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
554                                                 MPI_Recv(&tempChange[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD, &status);
555                                                 
556                                                 for(int j=otuStart;j<otuStop;j++){
557                                                         centroids[j] = tempCentroids[j];
558                                                         change[j] = tempChange[j];
559                                                 }
560                                         }
561                                                                         
562                                         maxDelta = getNewWeights(); if (m->control_pressed) { break; }
563                                         double nLL = getLikelihood(); if (m->control_pressed) { break; }
564                                         checkCentroids(); if (m->control_pressed) { break; }
565                                         
566                                         for(int i=1;i<ncpus;i++){
567                                                 MPI_Send(&centroids[0], numOTUs, MPI_INT, i, tag, MPI_COMM_WORLD);
568                                                 MPI_Send(&weight[0], numOTUs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD);
569                                                 MPI_Send(&change[0], numOTUs, MPI_SHORT, i, tag, MPI_COMM_WORLD);
570                                         }
571                                         
572                                         calcNewDistancesParent(0, numSeqsOnCPU);
573
574                                         total = singleTau.size();
575
576                                         for(int i=1;i<ncpus;i++){
577                                                 int childTotal;
578                                                 int seqStart = i * numSeqs / ncpus;
579                                                 int seqStop = (i + 1) * numSeqs / ncpus;
580                                                 
581                                                 MPI_Recv(&childTotal, 1, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
582
583                                                 vector<int> childSeqIndex(childTotal, 0);
584                                                 vector<double> childSingleTau(childTotal, 0);
585                                                 vector<double> childDist(numSeqs * numOTUs, 0);
586                                                 vector<int> otuIndex(childTotal, 0);
587                                                 
588                                                 MPI_Recv(&childSeqIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
589                                                 MPI_Recv(&childSingleTau[0], childTotal, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
590                                                 MPI_Recv(&childDist[0], numOTUs * numSeqs, MPI_DOUBLE, i, tag, MPI_COMM_WORLD, &status);
591                                                 MPI_Recv(&otuIndex[0], childTotal, MPI_INT, i, tag, MPI_COMM_WORLD, &status);
592
593                                                 int oldTotal = total;
594                                                 total += childTotal;
595                                                 singleTau.resize(total, 0);
596                                                 seqIndex.resize(total, 0);
597                                                 seqNumber.resize(total, 0);
598                                                 
599                                                 int childIndex = 0;
600                                                 
601                                                 for(int j=oldTotal;j<total;j++){
602                                                         int otuI = otuIndex[childIndex];
603                                                         int seqI = childSeqIndex[childIndex];
604                                                         
605                                                         singleTau[j] = childSingleTau[childIndex];
606                                                         
607                                                         aaP[otuI][nSeqsPerOTU[otuI]] = j;
608                                                         aaI[otuI][nSeqsPerOTU[otuI]] = seqI;
609                                                         nSeqsPerOTU[otuI]++;
610                                                         childIndex++;
611                                                 }
612                                                 
613                                                 int index = seqStart * numOTUs;
614                                                 for(int j=seqStart;j<seqStop;j++){
615                                                         for(int k=0;k<numOTUs;k++){
616                                                                 dist[index] = childDist[index];
617                                                                 index++;
618                                                         }
619                                                 }                                       
620                                         }
621                                         
622                                         iter++;
623                                         
624                                         m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');                  
625
626                                         if((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
627                                                 int live = 1;
628                                                 for(int i=1;i<ncpus;i++){
629                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD);
630                                                 }
631                                         }
632                                         else{
633                                                 int live = 0;
634                                                 for(int i=1;i<ncpus;i++){
635                                                         MPI_Send(&live, 1, MPI_INT, i, tag, MPI_COMM_WORLD); //send kill command
636                                                 }
637                                         }
638                                         
639                                 }       
640                                 
641                                 if (m->control_pressed) { break; }
642                                 
643                                 m->mothurOut("\nFinalizing...\n");
644                                 fill();
645                                 
646                                 if (m->control_pressed) { break; }
647                                 
648                                 setOTUs();
649                                 
650                                 vector<int> otuCounts(numOTUs, 0);
651                                 for(int i=0;i<numSeqs;i++)      {       otuCounts[otuData[i]]++;        }
652                                 calcCentroidsDriver(0, numOTUs);
653                                 
654                                 if (m->control_pressed) { break; }
655                                 
656                                 writeQualities(otuCounts);      if (m->control_pressed) { break; }
657                                 writeSequences(otuCounts);      if (m->control_pressed) { break; }
658                                 writeNames(otuCounts);          if (m->control_pressed) { break; }
659                                 writeClusters(otuCounts);       if (m->control_pressed) { break; }
660                                 writeGroups();                          if (m->control_pressed) { break; }
661                                 
662                                                                  
663                                 m->mothurOut("Total time to process " + toString(flowFileName) + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');                 
664                         }
665                 }
666                 else{
667                         int abort = 1;
668
669                         MPI_Recv(&abort, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
670                         if(abort){      return 0;       }
671
672                         int numFiles;
673                         MPI_Recv(&numFiles, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
674
675                         for(int i=0;i<numFiles;i++){
676                                 
677                                 if (m->control_pressed) { break; }
678                                 
679                                 //Now into the pyrodist part
680                                 bool live = 1;
681
682                                 char fileName[1024];
683                                 MPI_Recv(&fileName, 1024, MPI_CHAR, 0, tag, MPI_COMM_WORLD, &status);
684                                 MPI_Recv(&numSeqs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
685                                 MPI_Recv(&numUniques, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
686                                 MPI_Recv(&numFlowCells, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
687                                 
688                                 flowDataIntI.resize(numSeqs * numFlowCells);
689                                 flowDataPrI.resize(numSeqs * numFlowCells);
690                                 mapUniqueToSeq.resize(numSeqs);
691                                 mapSeqToUnique.resize(numSeqs);
692                                 lengths.resize(numSeqs);
693                                 jointLookUp.resize(NUMBINS * NUMBINS);
694
695                                 MPI_Recv(&flowDataIntI[0], numSeqs * numFlowCells, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
696                                 MPI_Recv(&flowDataPrI[0], numSeqs * numFlowCells, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
697                                 MPI_Recv(&mapUniqueToSeq[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
698                                 MPI_Recv(&mapSeqToUnique[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
699                                 MPI_Recv(&lengths[0], numSeqs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
700                                 MPI_Recv(&jointLookUp[0], NUMBINS * NUMBINS, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
701                                 MPI_Recv(&cutoff, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
702
703                                 flowFileName = string(fileName);
704                                 int flowDistStart = int(sqrt(float(pid)/float(ncpus)) * numUniques);
705                                 int flowDistEnd = int(sqrt(float(pid+1)/float(ncpus)) * numUniques);
706                                 
707                                 string distanceStringChild = flowDistMPI(flowDistStart, flowDistEnd);
708                                 
709                                 if (m->control_pressed) { break; }
710                                 
711                                 int done = 1;
712                                 MPI_Send(&done, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
713
714                                 //Now into the pyronoise part
715                                 MPI_Recv(&numOTUs, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
716                                 
717                                 singleLookUp.resize(HOMOPS * NUMBINS);
718                                 uniqueFlowgrams.resize(numUniques * numFlowCells);
719                                 weight.resize(numOTUs);
720                                 centroids.resize(numOTUs);
721                                 change.resize(numOTUs);
722                                 dist.assign(numOTUs * numSeqs, 0);
723                                 nSeqsPerOTU.resize(numOTUs);
724                                 cumNumSeqs.resize(numOTUs);
725
726                                 MPI_Recv(&singleLookUp[0], singleLookUp.size(), MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
727                                 MPI_Recv(&uniqueFlowgrams[0], uniqueFlowgrams.size(), MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
728                                 MPI_Recv(&sigma, 1, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
729                                 
730                                 int startOTU = pid * numOTUs / ncpus;
731                                 int endOTU = (pid + 1) * numOTUs / ncpus;
732
733                                 int startSeq = pid * numSeqs / ncpus;
734                                 int endSeq = (pid + 1) * numSeqs /ncpus;
735                                 
736                                 int total;
737
738                                 while(live){
739                                         
740                                         if (m->control_pressed) { break; }
741                                         
742                                         MPI_Recv(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
743                                         singleTau.assign(total, 0.0000);
744                                         seqNumber.assign(total, 0);
745                                         seqIndex.assign(total, 0);
746
747                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
748                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
749                                         MPI_Recv(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
750                                         MPI_Recv(&seqNumber[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
751                                         MPI_Recv(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
752                                         MPI_Recv(&nSeqsPerOTU[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
753                                         MPI_Recv(&cumNumSeqs[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
754
755                                         calcCentroidsDriver(startOTU, endOTU);
756
757                                         MPI_Send(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD);
758                                         MPI_Send(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD);
759
760                                         MPI_Recv(&centroids[0], numOTUs, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
761                                         MPI_Recv(&weight[0], numOTUs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD, &status);
762                                         MPI_Recv(&change[0], numOTUs, MPI_SHORT, 0, tag, MPI_COMM_WORLD, &status);
763
764                                         vector<int> otuIndex(total, 0);
765                                         calcNewDistancesChildMPI(startSeq, endSeq, otuIndex);
766                                         total = otuIndex.size();
767                                         
768                                         MPI_Send(&total, 1, MPI_INT, 0, tag, MPI_COMM_WORLD);
769                                         MPI_Send(&seqIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
770                                         MPI_Send(&singleTau[0], total, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
771                                         MPI_Send(&dist[0], numOTUs * numSeqs, MPI_DOUBLE, 0, tag, MPI_COMM_WORLD);
772                                         MPI_Send(&otuIndex[0], total, MPI_INT, 0, tag, MPI_COMM_WORLD);
773                                 
774                                         MPI_Recv(&live, 1, MPI_INT, 0, tag, MPI_COMM_WORLD, &status);
775                                 }
776                         }
777                 }
778                 
779                 if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
780                 
781                 MPI_Barrier(MPI_COMM_WORLD);
782
783                 
784                 if(compositeFASTAFileName != ""){
785                         outputNames.push_back(compositeFASTAFileName); outputTypes["fasta"].push_back(compositeFASTAFileName);
786                         outputNames.push_back(compositeNamesFileName); outputTypes["name"].push_back(compositeNamesFileName); 
787                 }
788
789                 m->mothurOutEndLine();
790                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
791                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
792                 m->mothurOutEndLine();
793                 
794                 return 0;
795
796         }
797         catch(exception& e) {
798                 m->errorOut(e, "ShhherCommand", "execute");
799                 exit(1);
800         }
801 }
802 /**************************************************************************************************/
803 string ShhherCommand::createNamesFile(){
804         try{
805                 
806                 vector<string> duplicateNames(numUniques, "");
807                 for(int i=0;i<numSeqs;i++){
808                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
809                 }
810                 map<string, string> variables; 
811                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(flowFileName));
812                 string nameFileName = getOutputFileName("name",variables);
813                 
814                 ofstream nameFile;
815                 m->openOutputFile(nameFileName, nameFile);
816                 
817                 for(int i=0;i<numUniques;i++){
818                         
819                         if (m->control_pressed) { break; }
820                         
821             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
822                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
823                 }
824                 
825                 nameFile.close();
826                 return  nameFileName;
827         }
828         catch(exception& e) {
829                 m->errorOut(e, "ShhherCommand", "createNamesFile");
830                 exit(1);
831         }
832 }
833 /**************************************************************************************************/
834
835 string ShhherCommand::flowDistMPI(int startSeq, int stopSeq){
836         try{            
837                 ostringstream outStream;
838                 outStream.setf(ios::fixed, ios::floatfield);
839                 outStream.setf(ios::dec, ios::basefield);
840                 outStream.setf(ios::showpoint);
841                 outStream.precision(6);
842                 
843                 int begTime = time(NULL);
844                 double begClock = clock();
845                 
846                 for(int i=startSeq;i<stopSeq;i++){
847                         
848                         if (m->control_pressed) { break; }
849                         
850                         for(int j=0;j<i;j++){
851                                 float flowDistance = calcPairwiseDist(mapUniqueToSeq[i], mapUniqueToSeq[j]);
852                                 
853                                 if(flowDistance < 1e-6){
854                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
855                                 }
856                                 else if(flowDistance <= cutoff){
857                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
858                                 }
859                         }
860                         if(i % 100 == 0){
861                                 m->mothurOutJustToScreen(toString(i) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
862                         }
863                 }
864                 
865                 string fDistFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
866                 if(pid != 0){   fDistFileName += ".temp." + toString(pid);      }
867                 
868                 if (m->control_pressed) { return fDistFileName; }
869                 
870                 m->mothurOutJustToScreen(toString(stopSeq) + '\t' + toString(time(NULL) - begTime) + '\t' + toString((clock()-begClock)/CLOCKS_PER_SEC) + '\n');
871
872                 ofstream distFile(fDistFileName.c_str());
873                 distFile << outStream.str();            
874                 distFile.close();
875                 
876                 return fDistFileName;
877         }
878         catch(exception& e) {
879                 m->errorOut(e, "ShhherCommand", "flowDistMPI");
880                 exit(1);
881         }
882 }
883 /**************************************************************************************************/
884  
885 void ShhherCommand::getOTUData(string listFileName){
886     try {
887         
888         ifstream listFile;
889         m->openInputFile(listFileName, listFile);
890         string label;
891         
892         listFile >> label >> numOTUs;
893         
894         otuData.assign(numSeqs, 0);
895         cumNumSeqs.assign(numOTUs, 0);
896         nSeqsPerOTU.assign(numOTUs, 0);
897         aaP.clear();aaP.resize(numOTUs);
898         
899         seqNumber.clear();
900         aaI.clear();
901         seqIndex.clear();
902         
903         string singleOTU = "";
904         
905         for(int i=0;i<numOTUs;i++){
906             
907             if (m->control_pressed) { break; }
908             
909             listFile >> singleOTU;
910             
911             istringstream otuString(singleOTU);
912             
913             while(otuString){
914                 
915                 string seqName = "";
916                 
917                 for(int j=0;j<singleOTU.length();j++){
918                     char letter = otuString.get();
919                     
920                     if(letter != ','){
921                         seqName += letter;
922                     }
923                     else{
924                         map<string,int>::iterator nmIt = nameMap.find(seqName);
925                         int index = nmIt->second;
926                         
927                         nameMap.erase(nmIt);
928                         
929                         otuData[index] = i;
930                         nSeqsPerOTU[i]++;
931                         aaP[i].push_back(index);
932                         seqName = "";
933                     }
934                 }
935                 
936                 map<string,int>::iterator nmIt = nameMap.find(seqName);
937                 
938                 int index = nmIt->second;
939                 nameMap.erase(nmIt);
940                 
941                 otuData[index] = i;
942                 nSeqsPerOTU[i]++;
943                 aaP[i].push_back(index);        
944                 
945                 otuString.get();
946             }
947             
948             sort(aaP[i].begin(), aaP[i].end());
949             for(int j=0;j<nSeqsPerOTU[i];j++){
950                 seqNumber.push_back(aaP[i][j]);
951             }
952             for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
953                 aaP[i].push_back(0);
954             }
955             
956             
957         }
958         
959         for(int i=1;i<numOTUs;i++){
960             cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
961         }
962         aaI = aaP;
963         seqIndex = seqNumber;
964         
965         listFile.close();       
966         
967     }
968     catch(exception& e) {
969         m->errorOut(e, "ShhherCommand", "getOTUData");
970         exit(1);        
971     }           
972 }
973
974 /**************************************************************************************************/
975
976 void ShhherCommand::initPyroCluster(){                          
977     try{
978         if (numOTUs < processors) { processors = 1; }
979         
980         if (m->debug) { m->mothurOut("[DEBUG]: numSeqs = " + toString(numSeqs) + " numOTUS = " + toString(numOTUs) + " about to alloc a dist vector with size = " + toString((numSeqs * numOTUs)) + ".\n"); }
981         
982         dist.assign(numSeqs * numOTUs, 0);
983         change.assign(numOTUs, 1);
984         centroids.assign(numOTUs, -1);
985         weight.assign(numOTUs, 0);
986         singleTau.assign(numSeqs, 1.0);
987         
988         nSeqsBreaks.assign(processors+1, 0);
989         nOTUsBreaks.assign(processors+1, 0);
990         
991         if (m->debug) { m->mothurOut("[DEBUG]: made it through the memory allocation.\n"); }
992         
993         nSeqsBreaks[0] = 0;
994         for(int i=0;i<processors;i++){
995             nSeqsBreaks[i+1] = nSeqsBreaks[i] + (int)((double) numSeqs / (double) processors);
996             nOTUsBreaks[i+1] = nOTUsBreaks[i] + (int)((double) numOTUs / (double) processors);
997         }
998         nSeqsBreaks[processors] = numSeqs;
999         nOTUsBreaks[processors] = numOTUs;
1000     }
1001     catch(exception& e) {
1002         m->errorOut(e, "ShhherCommand", "initPyroCluster");
1003         exit(1);        
1004     }
1005 }
1006
1007 /**************************************************************************************************/
1008
1009 void ShhherCommand::fill(){
1010     try {
1011         int index = 0;
1012         for(int i=0;i<numOTUs;i++){
1013             
1014             if (m->control_pressed) { break; }
1015             
1016             cumNumSeqs[i] = index;
1017             for(int j=0;j<nSeqsPerOTU[i];j++){
1018                 seqNumber[index] = aaP[i][j];
1019                 seqIndex[index] = aaI[i][j];
1020                 
1021                 index++;
1022             }
1023         }
1024     }
1025     catch(exception& e) {
1026         m->errorOut(e, "ShhherCommand", "fill");
1027         exit(1);        
1028     }           
1029 }
1030
1031 /**************************************************************************************************/
1032  
1033 void ShhherCommand::getFlowData(){
1034     try{
1035         ifstream flowFile;
1036         m->openInputFile(flowFileName, flowFile);
1037         
1038         string seqName;
1039         seqNameVector.clear();
1040         lengths.clear();
1041         flowDataIntI.clear();
1042         nameMap.clear();
1043         
1044         
1045         int currentNumFlowCells;
1046         
1047         float intensity;
1048         
1049         string numFlowTest;
1050         flowFile >> numFlowTest;
1051         
1052         if (!m->isContainingOnlyDigits(numFlowTest)) { m->mothurOut("[ERROR]: expected a number and got " + numFlowTest + ", quitting. Did you use the flow parameter instead of the file parameter?"); m->mothurOutEndLine(); exit(1); }
1053         else { convert(numFlowTest, numFlowCells); }
1054         
1055         int index = 0;//pcluster
1056         while(!flowFile.eof()){
1057             
1058             if (m->control_pressed) { break; }
1059             
1060             flowFile >> seqName >> currentNumFlowCells;
1061             lengths.push_back(currentNumFlowCells);
1062             
1063             seqNameVector.push_back(seqName);
1064             nameMap[seqName] = index++;//pcluster
1065             
1066             for(int i=0;i<numFlowCells;i++){
1067                 flowFile >> intensity;
1068                 if(intensity > 9.99)    {       intensity = 9.99;       }
1069                 int intI = int(100 * intensity + 0.0001);
1070                 flowDataIntI.push_back(intI);
1071             }
1072             m->gobble(flowFile);
1073         }
1074         flowFile.close();
1075         
1076         numSeqs = seqNameVector.size();         
1077         
1078         for(int i=0;i<numSeqs;i++){
1079             
1080             if (m->control_pressed) { break; }
1081             
1082             int iNumFlowCells = i * numFlowCells;
1083             for(int j=lengths[i];j<numFlowCells;j++){
1084                 flowDataIntI[iNumFlowCells + j] = 0;
1085             }
1086         }
1087         
1088     }
1089     catch(exception& e) {
1090         m->errorOut(e, "ShhherCommand", "getFlowData");
1091         exit(1);
1092     }
1093 }
1094 /**************************************************************************************************/
1095 void ShhherCommand::calcNewDistancesChildMPI(int startSeq, int stopSeq, vector<int>& otuIndex){
1096         
1097         try{
1098                 vector<double> newTau(numOTUs,0);
1099                 vector<double> norms(numSeqs, 0);
1100                 otuIndex.clear();
1101                 seqIndex.clear();
1102                 singleTau.clear();
1103                 
1104                 for(int i=startSeq;i<stopSeq;i++){
1105                         
1106                         if (m->control_pressed) { break; }
1107                         
1108                         double offset = 1e8;
1109                         int indexOffset = i * numOTUs;
1110                         
1111                         for(int j=0;j<numOTUs;j++){
1112                                 
1113                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1114                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1115                                 }
1116                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1117                                         offset = dist[indexOffset + j];
1118                                 }
1119                         }
1120                         
1121                         for(int j=0;j<numOTUs;j++){
1122                                 if(weight[j] > MIN_WEIGHT){
1123                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1124                                         norms[i] += newTau[j];
1125                                 }
1126                                 else{
1127                                         newTau[j] = 0.0;
1128                                 }
1129                         }
1130                         
1131                         for(int j=0;j<numOTUs;j++){
1132                 
1133                                 newTau[j] /= norms[i];
1134                                 
1135                                 if(newTau[j] > MIN_TAU){
1136                                         otuIndex.push_back(j);
1137                                         seqIndex.push_back(i);
1138                                         singleTau.push_back(newTau[j]);
1139                                 }
1140                         }
1141                         
1142                 }
1143         }
1144         catch(exception& e) {
1145                 m->errorOut(e, "ShhherCommand", "calcNewDistancesChildMPI");
1146                 exit(1);        
1147         }               
1148 }
1149
1150 /**************************************************************************************************/
1151
1152 void ShhherCommand::calcNewDistancesParent(int startSeq, int stopSeq){
1153     
1154     try{
1155         
1156         int total = 0;
1157         vector<double> newTau(numOTUs,0);
1158         vector<double> norms(numSeqs, 0);
1159         nSeqsPerOTU.assign(numOTUs, 0);
1160         
1161         for(int i=startSeq;i<stopSeq;i++){
1162             
1163             if (m->control_pressed) { break; }
1164             
1165             int indexOffset = i * numOTUs;
1166             
1167             double offset = 1e8;
1168             
1169             for(int j=0;j<numOTUs;j++){
1170                 
1171                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
1172                     dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i]);
1173                 }
1174                 
1175                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
1176                     offset = dist[indexOffset + j];
1177                 }
1178             }
1179             
1180             for(int j=0;j<numOTUs;j++){
1181                 if(weight[j] > MIN_WEIGHT){
1182                     newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
1183                     norms[i] += newTau[j];
1184                 }
1185                 else{
1186                     newTau[j] = 0.0;
1187                 }
1188             }
1189             
1190             for(int j=0;j<numOTUs;j++){
1191                 newTau[j] /= norms[i];
1192             }
1193             
1194             for(int j=0;j<numOTUs;j++){
1195                 if(newTau[j] > MIN_TAU){
1196                     
1197                     int oldTotal = total;
1198                     
1199                     total++;
1200                     
1201                     singleTau.resize(total, 0);
1202                     seqNumber.resize(total, 0);
1203                     seqIndex.resize(total, 0);
1204                     
1205                     singleTau[oldTotal] = newTau[j];
1206                     
1207                     aaP[j][nSeqsPerOTU[j]] = oldTotal;
1208                     aaI[j][nSeqsPerOTU[j]] = i;
1209                     nSeqsPerOTU[j]++;
1210                 }
1211             }
1212             
1213         }
1214         
1215     }
1216     catch(exception& e) {
1217         m->errorOut(e, "ShhherCommand", "calcNewDistancesParent");
1218         exit(1);        
1219     }           
1220 }
1221
1222 /**************************************************************************************************/
1223
1224 void ShhherCommand::setOTUs(){
1225     
1226     try {
1227         vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
1228         
1229         for(int i=0;i<numOTUs;i++){
1230             
1231             if (m->control_pressed) { break; }
1232             
1233             for(int j=0;j<nSeqsPerOTU[i];j++){
1234                 int index = cumNumSeqs[i] + j;
1235                 double tauValue = singleTau[seqNumber[index]];
1236                 int sIndex = seqIndex[index];
1237                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
1238             }
1239         }
1240         
1241         for(int i=0;i<numSeqs;i++){
1242             double maxTau = -1.0000;
1243             int maxOTU = -1;
1244             for(int j=0;j<numOTUs;j++){
1245                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
1246                     maxTau = bigTauMatrix[i * numOTUs + j];
1247                     maxOTU = j;
1248                 }
1249             }
1250             
1251             otuData[i] = maxOTU;
1252         }
1253         
1254         nSeqsPerOTU.assign(numOTUs, 0);         
1255         
1256         for(int i=0;i<numSeqs;i++){
1257             int index = otuData[i];
1258             
1259             singleTau[i] = 1.0000;
1260             dist[i] = 0.0000;
1261             
1262             aaP[index][nSeqsPerOTU[index]] = i;
1263             aaI[index][nSeqsPerOTU[index]] = i;
1264             
1265             nSeqsPerOTU[index]++;
1266         }
1267         fill(); 
1268     }
1269     catch(exception& e) {
1270         m->errorOut(e, "ShhherCommand", "setOTUs");
1271         exit(1);        
1272     }           
1273 }
1274
1275 /**************************************************************************************************/
1276  
1277 void ShhherCommand::getUniques(){
1278     try{
1279         
1280         
1281         numUniques = 0;
1282         uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
1283         uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
1284         uniqueLengths.assign(numSeqs, 0);
1285         mapSeqToUnique.assign(numSeqs, -1);
1286         mapUniqueToSeq.assign(numSeqs, -1);
1287         
1288         vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
1289         
1290         for(int i=0;i<numSeqs;i++){
1291             
1292             if (m->control_pressed) { break; }
1293             
1294             int index = 0;
1295             
1296             vector<short> current(numFlowCells);
1297             for(int j=0;j<numFlowCells;j++){
1298                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
1299             }
1300             
1301             for(int j=0;j<numUniques;j++){
1302                 int offset = j * numFlowCells;
1303                 bool toEnd = 1;
1304                 
1305                 int shorterLength;
1306                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
1307                 else                                                            {       shorterLength = uniqueLengths[j];       }
1308                 
1309                 for(int k=0;k<shorterLength;k++){
1310                     if(current[k] != uniqueFlowgrams[offset + k]){
1311                         toEnd = 0;
1312                         break;
1313                     }
1314                 }
1315                 
1316                 if(toEnd){
1317                     mapSeqToUnique[i] = j;
1318                     uniqueCount[j]++;
1319                     index = j;
1320                     if(lengths[i] > uniqueLengths[j])   {       uniqueLengths[j] = lengths[i];  }
1321                     break;
1322                 }
1323                 index++;
1324             }
1325             
1326             if(index == numUniques){
1327                 uniqueLengths[numUniques] = lengths[i];
1328                 uniqueCount[numUniques] = 1;
1329                 mapSeqToUnique[i] = numUniques;//anMap
1330                 mapUniqueToSeq[numUniques] = i;//anF
1331                 
1332                 for(int k=0;k<numFlowCells;k++){
1333                     uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
1334                     uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
1335                 }
1336                 
1337                 numUniques++;
1338             }
1339         }
1340         uniqueFlowDataIntI.resize(numFlowCells * numUniques);
1341         uniqueLengths.resize(numUniques);       
1342         
1343         flowDataPrI.resize(numSeqs * numFlowCells, 0);
1344         for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
1345     }
1346     catch(exception& e) {
1347         m->errorOut(e, "ShhherCommand", "getUniques");
1348         exit(1);
1349     }
1350 }
1351
1352 /**************************************************************************************************/
1353
1354 float ShhherCommand::calcPairwiseDist(int seqA, int seqB){
1355     try{
1356         int minLength = lengths[mapSeqToUnique[seqA]];
1357         if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
1358         
1359         int ANumFlowCells = seqA * numFlowCells;
1360         int BNumFlowCells = seqB * numFlowCells;
1361         
1362         float dist = 0;
1363         
1364         for(int i=0;i<minLength;i++){
1365             
1366             if (m->control_pressed) { break; }
1367             
1368             int flowAIntI = flowDataIntI[ANumFlowCells + i];
1369             float flowAPrI = flowDataPrI[ANumFlowCells + i];
1370             
1371             int flowBIntI = flowDataIntI[BNumFlowCells + i];
1372             float flowBPrI = flowDataPrI[BNumFlowCells + i];
1373             dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
1374         }
1375         
1376         dist /= (float) minLength;
1377         return dist;
1378     }
1379     catch(exception& e) {
1380         m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
1381         exit(1);
1382     }
1383 }
1384
1385 //**********************************************************************************************************************/
1386
1387 string ShhherCommand::cluster(string distFileName, string namesFileName){
1388     try {
1389         
1390         ReadMatrix* read = new ReadColumnMatrix(distFileName);  
1391                 read->setCutoff(cutoff);
1392                 
1393                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
1394                 clusterNameMap->readMap();
1395                 read->read(clusterNameMap);
1396         
1397                 ListVector* list = read->getListVector();
1398                 SparseDistanceMatrix* matrix = read->getDMatrix();
1399                 
1400                 delete read; 
1401                 delete clusterNameMap; 
1402         
1403         RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
1404         
1405         Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
1406         string tag = cluster->getTag();
1407         
1408         double clusterCutoff = cutoff;
1409         while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
1410             
1411             if (m->control_pressed) { break; }
1412             
1413             cluster->update(clusterCutoff);
1414         }
1415         
1416         list->setLabel(toString(cutoff));
1417         
1418         string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
1419         ofstream listFile;
1420         m->openOutputFile(listFileName, listFile);
1421         list->print(listFile);
1422         listFile.close();
1423         
1424         delete matrix;  delete cluster; delete rabund; delete list;
1425         
1426         return listFileName;
1427     }
1428     catch(exception& e) {
1429         m->errorOut(e, "ShhherCommand", "cluster");
1430         exit(1);        
1431     }           
1432 }
1433
1434 /**************************************************************************************************/
1435
1436 void ShhherCommand::calcCentroidsDriver(int start, int finish){                          
1437     
1438     //this function gets the most likely homopolymer length at a flow position for a group of sequences
1439     //within an otu
1440     
1441     try{
1442         
1443         for(int i=start;i<finish;i++){
1444             
1445             if (m->control_pressed) { break; }
1446             
1447             double count = 0;
1448             int position = 0;
1449             int minFlowGram = 100000000;
1450             double minFlowValue = 1e8;
1451             change[i] = 0; //FALSE
1452             
1453             for(int j=0;j<nSeqsPerOTU[i];j++){
1454                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
1455             }
1456             
1457             if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
1458                 vector<double> adF(nSeqsPerOTU[i]);
1459                 vector<int> anL(nSeqsPerOTU[i]);
1460                 
1461                 for(int j=0;j<nSeqsPerOTU[i];j++){
1462                     int index = cumNumSeqs[i] + j;
1463                     int nI = seqIndex[index];
1464                     int nIU = mapSeqToUnique[nI];
1465                     
1466                     int k;
1467                     for(k=0;k<position;k++){
1468                         if(nIU == anL[k]){
1469                             break;
1470                         }
1471                     }
1472                     if(k == position){
1473                         anL[position] = nIU;
1474                         adF[position] = 0.0000;
1475                         position++;
1476                     }                                           
1477                 }
1478                 
1479                 for(int j=0;j<nSeqsPerOTU[i];j++){
1480                     int index = cumNumSeqs[i] + j;
1481                     int nI = seqIndex[index];
1482                     
1483                     double tauValue = singleTau[seqNumber[index]];
1484                     
1485                     for(int k=0;k<position;k++){
1486                         double dist = getDistToCentroid(anL[k], nI, lengths[nI]);
1487                         adF[k] += dist * tauValue;
1488                     }
1489                 }
1490                 
1491                 for(int j=0;j<position;j++){
1492                     if(adF[j] < minFlowValue){
1493                         minFlowGram = j;
1494                         minFlowValue = adF[j];
1495                     }
1496                 }
1497                 
1498                 if(centroids[i] != anL[minFlowGram]){
1499                     change[i] = 1;
1500                     centroids[i] = anL[minFlowGram];
1501                 }
1502             }
1503             else if(centroids[i] != -1){
1504                 change[i] = 1;
1505                 centroids[i] = -1;                      
1506             }
1507         }
1508     }
1509     catch(exception& e) {
1510         m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
1511         exit(1);        
1512     }           
1513 }
1514
1515 /**************************************************************************************************/
1516
1517 double ShhherCommand::getDistToCentroid(int cent, int flow, int length){
1518     try{
1519         
1520         int flowAValue = cent * numFlowCells;
1521         int flowBValue = flow * numFlowCells;
1522         
1523         double dist = 0;
1524         
1525         for(int i=0;i<length;i++){
1526             dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
1527             flowAValue++;
1528             flowBValue++;
1529         }
1530         
1531         return dist / (double)length;
1532     }
1533     catch(exception& e) {
1534         m->errorOut(e, "ShhherCommand", "getDistToCentroid");
1535         exit(1);        
1536     }           
1537 }
1538
1539 /**************************************************************************************************/
1540
1541 double ShhherCommand::getNewWeights(){
1542     try{
1543         
1544         double maxChange = 0;
1545         
1546         for(int i=0;i<numOTUs;i++){
1547             
1548             if (m->control_pressed) { break; }
1549             
1550             double difference = weight[i];
1551             weight[i] = 0;
1552             
1553             for(int j=0;j<nSeqsPerOTU[i];j++){
1554                 int index = cumNumSeqs[i] + j;
1555                 double tauValue = singleTau[seqNumber[index]];
1556                 weight[i] += tauValue;
1557             }
1558             
1559             difference = fabs(weight[i] - difference);
1560             if(difference > maxChange){ maxChange = difference; }
1561         }
1562         return maxChange;
1563     }
1564     catch(exception& e) {
1565         m->errorOut(e, "ShhherCommand", "getNewWeights");
1566         exit(1);        
1567     }           
1568 }
1569  
1570  /**************************************************************************************************/
1571  
1572 double ShhherCommand::getLikelihood(){
1573     
1574     try{
1575         
1576         vector<long double> P(numSeqs, 0);
1577         int effNumOTUs = 0;
1578         
1579         for(int i=0;i<numOTUs;i++){
1580             if(weight[i] > MIN_WEIGHT){
1581                 effNumOTUs++;
1582             }
1583         }
1584         
1585         string hold;
1586         for(int i=0;i<numOTUs;i++){
1587             
1588             if (m->control_pressed) { break; }
1589             
1590             for(int j=0;j<nSeqsPerOTU[i];j++){
1591                 int index = cumNumSeqs[i] + j;
1592                 int nI = seqIndex[index];
1593                 double singleDist = dist[seqNumber[index]];
1594                 
1595                 P[nI] += weight[i] * exp(-singleDist * sigma);
1596             }
1597         }
1598         double nLL = 0.00;
1599         for(int i=0;i<numSeqs;i++){
1600             if(P[i] == 0){      P[i] = DBL_EPSILON;     }
1601             
1602             nLL += -log(P[i]);
1603         }
1604         
1605         nLL = nLL -(double)numSeqs * log(sigma);
1606         
1607         return nLL; 
1608     }
1609     catch(exception& e) {
1610         m->errorOut(e, "ShhherCommand", "getNewWeights");
1611         exit(1);        
1612     }           
1613 }
1614
1615 /**************************************************************************************************/
1616
1617 void ShhherCommand::checkCentroids(){
1618     try{
1619         vector<int> unique(numOTUs, 1);
1620         
1621         for(int i=0;i<numOTUs;i++){
1622             if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
1623                 unique[i] = -1;
1624             }
1625         }
1626         
1627         for(int i=0;i<numOTUs;i++){
1628             
1629             if (m->control_pressed) { break; }
1630             
1631             if(unique[i] == 1){
1632                 for(int j=i+1;j<numOTUs;j++){
1633                     if(unique[j] == 1){
1634                         
1635                         if(centroids[j] == centroids[i]){
1636                             unique[j] = 0;
1637                             centroids[j] = -1;
1638                             
1639                             weight[i] += weight[j];
1640                             weight[j] = 0.0;
1641                         }
1642                     }
1643                 }
1644             }
1645         }
1646     }
1647     catch(exception& e) {
1648         m->errorOut(e, "ShhherCommand", "checkCentroids");
1649         exit(1);        
1650     }           
1651 }
1652  /**************************************************************************************************/
1653
1654
1655  
1656 void ShhherCommand::writeQualities(vector<int> otuCounts){
1657     
1658     try {
1659         string thisOutputDir = outputDir;
1660         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1661         map<string, string> variables; 
1662                 variables["[filename]"] = outputDir + m->getRootName(m->getSimpleName(flowFileName));
1663         string qualityFileName = getOutputFileName("qfile",variables);
1664         
1665         ofstream qualityFile;
1666         m->openOutputFile(qualityFileName, qualityFile);
1667         
1668         qualityFile.setf(ios::fixed, ios::floatfield);
1669         qualityFile.setf(ios::showpoint);
1670         qualityFile << setprecision(6);
1671         
1672         vector<vector<int> > qualities(numOTUs);
1673         vector<double> pr(HOMOPS, 0);
1674         
1675         
1676         for(int i=0;i<numOTUs;i++){
1677             
1678             if (m->control_pressed) { break; }
1679             
1680             int index = 0;
1681             int base = 0;
1682             
1683             if(nSeqsPerOTU[i] > 0){
1684                 qualities[i].assign(1024, -1);
1685                 
1686                 while(index < numFlowCells){
1687                     double maxPrValue = 1e8;
1688                     short maxPrIndex = -1;
1689                     double count = 0.0000;
1690                     
1691                     pr.assign(HOMOPS, 0);
1692                     
1693                     for(int j=0;j<nSeqsPerOTU[i];j++){
1694                         int lIndex = cumNumSeqs[i] + j;
1695                         double tauValue = singleTau[seqNumber[lIndex]];
1696                         int sequenceIndex = aaI[i][j];
1697                         short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
1698                         
1699                         count += tauValue;
1700                         
1701                         for(int s=0;s<HOMOPS;s++){
1702                             pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
1703                         }
1704                     }
1705                     
1706                     maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
1707                     maxPrValue = pr[maxPrIndex];
1708                     
1709                     if(count > MIN_COUNT){
1710                         double U = 0.0000;
1711                         double norm = 0.0000;
1712                         
1713                         for(int s=0;s<HOMOPS;s++){
1714                             norm += exp(-(pr[s] - maxPrValue));
1715                         }
1716                         
1717                         for(int s=1;s<=maxPrIndex;s++){
1718                             int value = 0;
1719                             double temp = 0.0000;
1720                             
1721                             U += exp(-(pr[s-1]-maxPrValue))/norm;
1722                             
1723                             if(U>0.00){
1724                                 temp = log10(U);
1725                             }
1726                             else{
1727                                 temp = -10.1;
1728                             }
1729                             temp = floor(-10 * temp);
1730                             value = (int)floor(temp);
1731                             if(value > 100){    value = 100;    }
1732                             
1733                             qualities[i][base] = (int)value;
1734                             base++;
1735                         }
1736                     }
1737                     
1738                     index++;
1739                 }
1740             }
1741             
1742             
1743             if(otuCounts[i] > 0){
1744                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
1745                 
1746                 int j=4;        //need to get past the first four bases
1747                 while(qualities[i][j] != -1){
1748                     qualityFile << qualities[i][j] << ' ';
1749                     j++;
1750                 }
1751                 qualityFile << endl;
1752             }
1753         }
1754         qualityFile.close();
1755         outputNames.push_back(qualityFileName); outputTypes["qfile"].push_back(qualityFileName);
1756         
1757     }
1758     catch(exception& e) {
1759         m->errorOut(e, "ShhherCommand", "writeQualities");
1760         exit(1);        
1761     }           
1762 }
1763
1764 /**************************************************************************************************/
1765
1766 void ShhherCommand::writeSequences(vector<int> otuCounts){
1767     try {
1768         string thisOutputDir = outputDir;
1769         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1770         map<string, string> variables; 
1771                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1772         string fastaFileName = getOutputFileName("fasta",variables);
1773         ofstream fastaFile;
1774         m->openOutputFile(fastaFileName, fastaFile);
1775         
1776         vector<string> names(numOTUs, "");
1777         
1778         for(int i=0;i<numOTUs;i++){
1779             
1780             if (m->control_pressed) { break; }
1781             
1782             int index = centroids[i];
1783             
1784             if(otuCounts[i] > 0){
1785                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
1786                 
1787                 string newSeq = "";
1788                 
1789                 for(int j=0;j<numFlowCells;j++){
1790                     
1791                     char base = flowOrder[j % flowOrder.length()];
1792                     for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
1793                         newSeq += base;
1794                     }
1795                 }
1796                 
1797                 fastaFile << newSeq.substr(4) << endl;
1798             }
1799         }
1800         fastaFile.close();
1801         
1802         outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
1803         
1804         if(compositeFASTAFileName != ""){
1805             m->appendFiles(fastaFileName, compositeFASTAFileName);
1806         }
1807     }
1808     catch(exception& e) {
1809         m->errorOut(e, "ShhherCommand", "writeSequences");
1810         exit(1);        
1811     }           
1812 }
1813
1814 /**************************************************************************************************/
1815
1816 void ShhherCommand::writeNames(vector<int> otuCounts){
1817     try {
1818         string thisOutputDir = outputDir;
1819         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1820         map<string, string> variables; 
1821                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1822         string nameFileName = getOutputFileName("name",variables);
1823         ofstream nameFile;
1824         m->openOutputFile(nameFileName, nameFile);
1825         
1826         for(int i=0;i<numOTUs;i++){
1827             
1828             if (m->control_pressed) { break; }
1829             
1830             if(otuCounts[i] > 0){
1831                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
1832                 
1833                 for(int j=1;j<nSeqsPerOTU[i];j++){
1834                     nameFile << ',' << seqNameVector[aaI[i][j]];
1835                 }
1836                 
1837                 nameFile << endl;
1838             }
1839         }
1840         nameFile.close();
1841         outputNames.push_back(nameFileName); outputTypes["name"].push_back(nameFileName);
1842         
1843         
1844         if(compositeNamesFileName != ""){
1845             m->appendFiles(nameFileName, compositeNamesFileName);
1846         }               
1847     }
1848     catch(exception& e) {
1849         m->errorOut(e, "ShhherCommand", "writeNames");
1850         exit(1);        
1851     }           
1852 }
1853
1854 /**************************************************************************************************/
1855
1856 void ShhherCommand::writeGroups(){
1857     try {
1858         string thisOutputDir = outputDir;
1859         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1860         string fileRoot = m->getRootName(m->getSimpleName(flowFileName));
1861         int pos = fileRoot.find_first_of('.');
1862         string fileGroup = fileRoot;
1863         if (pos != string::npos) {  fileGroup = fileRoot.substr(pos+1, (fileRoot.length()-1-(pos+1)));  }
1864         map<string, string> variables; 
1865                 variables["[filename]"] = thisOutputDir + fileRoot;
1866         string groupFileName = getOutputFileName("group",variables);
1867         ofstream groupFile;
1868         m->openOutputFile(groupFileName, groupFile);
1869         
1870         for(int i=0;i<numSeqs;i++){
1871             if (m->control_pressed) { break; }
1872             groupFile << seqNameVector[i] << '\t' << fileGroup << endl;
1873         }
1874         groupFile.close();
1875         outputNames.push_back(groupFileName); outputTypes["group"].push_back(groupFileName);
1876         
1877     }
1878     catch(exception& e) {
1879         m->errorOut(e, "ShhherCommand", "writeGroups");
1880         exit(1);        
1881     }           
1882 }
1883
1884 /**************************************************************************************************/
1885
1886 void ShhherCommand::writeClusters(vector<int> otuCounts){
1887     try {
1888         string thisOutputDir = outputDir;
1889         if (outputDir == "") {  thisOutputDir += m->hasPath(flowFileName);  }
1890         map<string, string> variables; 
1891                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
1892         string otuCountsFileName = getOutputFileName("counts",variables);
1893         ofstream otuCountsFile;
1894         m->openOutputFile(otuCountsFileName, otuCountsFile);
1895         
1896         string bases = flowOrder;
1897         
1898         for(int i=0;i<numOTUs;i++){
1899             
1900             if (m->control_pressed) {
1901                 break;
1902             }
1903             //output the translated version of the centroid sequence for the otu
1904             if(otuCounts[i] > 0){
1905                 int index = centroids[i];
1906                 
1907                 otuCountsFile << "ideal\t";
1908                 for(int j=8;j<numFlowCells;j++){
1909                     char base = bases[j % bases.length()];
1910                     for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
1911                         otuCountsFile << base;
1912                     }
1913                 }
1914                 otuCountsFile << endl;
1915                 
1916                 for(int j=0;j<nSeqsPerOTU[i];j++){
1917                     int sequence = aaI[i][j];
1918                     otuCountsFile << seqNameVector[sequence] << '\t';
1919                     
1920                     string newSeq = "";
1921                     
1922                     for(int k=0;k<lengths[sequence];k++){
1923                         char base = bases[k % bases.length()];
1924                         int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
1925                         
1926                         for(int s=0;s<freq;s++){
1927                             newSeq += base;
1928                             //otuCountsFile << base;
1929                         }
1930                     }
1931                     otuCountsFile << newSeq.substr(4) << endl;
1932                 }
1933                 otuCountsFile << endl;
1934             }
1935         }
1936         otuCountsFile.close();
1937         outputNames.push_back(otuCountsFileName); outputTypes["counts"].push_back(otuCountsFileName);
1938         
1939     }
1940     catch(exception& e) {
1941         m->errorOut(e, "ShhherCommand", "writeClusters");
1942         exit(1);        
1943     }           
1944 }
1945
1946 #else
1947 //**********************************************************************************************************************
1948
1949 int ShhherCommand::execute(){
1950         try {
1951                 if (abort == true) { if (calledHelp) { return 0; }  return 2;   }
1952                 
1953                 getSingleLookUp();      if (m->control_pressed) { return 0; }
1954                 getJointLookUp();       if (m->control_pressed) { return 0; }
1955                 
1956         int numFiles = flowFileVector.size();
1957                 
1958         if (numFiles < processors) { processors = numFiles; }
1959         
1960 #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)
1961         if (processors == 1) { driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName); }
1962         else { createProcesses(flowFileVector); } //each processor processes one file
1963 #else
1964         driver(flowFileVector, compositeFASTAFileName, compositeNamesFileName);
1965 #endif
1966         
1967                 if(compositeFASTAFileName != ""){
1968                         outputNames.push_back(compositeFASTAFileName); outputTypes["fasta"].push_back(compositeFASTAFileName);
1969                         outputNames.push_back(compositeNamesFileName); outputTypes["name"].push_back(compositeNamesFileName);
1970                 }
1971
1972                 m->mothurOutEndLine();
1973                 m->mothurOut("Output File Names: "); m->mothurOutEndLine();
1974                 for (int i = 0; i < outputNames.size(); i++) {  m->mothurOut(outputNames[i]); m->mothurOutEndLine();    }
1975                 m->mothurOutEndLine();
1976                 
1977                 return 0;
1978         }
1979         catch(exception& e) {
1980                 m->errorOut(e, "ShhherCommand", "execute");
1981                 exit(1);
1982         }
1983 }
1984 #endif
1985 //********************************************************************************************************************
1986 //sorts biggest to smallest
1987 inline bool compareFileSizes(string left, string right){
1988     
1989     FILE * pFile;
1990     long leftsize = 0;
1991     
1992     //get num bytes in file
1993     string filename = left;
1994     pFile = fopen (filename.c_str(),"rb");
1995     string error = "Error opening " + filename;
1996     if (pFile==NULL) perror (error.c_str());
1997     else{
1998         fseek (pFile, 0, SEEK_END);
1999         leftsize=ftell (pFile);
2000         fclose (pFile);
2001     }
2002     
2003     FILE * pFile2;
2004     long rightsize = 0;
2005     
2006     //get num bytes in file
2007     filename = right;
2008     pFile2 = fopen (filename.c_str(),"rb");
2009     error = "Error opening " + filename;
2010     if (pFile2==NULL) perror (error.c_str());
2011     else{
2012         fseek (pFile2, 0, SEEK_END);
2013         rightsize=ftell (pFile2);
2014         fclose (pFile2);
2015     }
2016     
2017     return (leftsize > rightsize);      
2018
2019 /**************************************************************************************************/
2020
2021 int ShhherCommand::createProcesses(vector<string> filenames){
2022     try {
2023         vector<int> processIDS;
2024                 int process = 1;
2025                 int num = 0;
2026                 
2027                 //sanity check
2028                 if (filenames.size() < processors) { processors = filenames.size(); }
2029         
2030         //sort file names by size to divide load better
2031         sort(filenames.begin(), filenames.end(), compareFileSizes);
2032         
2033         vector < vector <string> > dividedFiles; //dividedFiles[1] = vector of filenames for process 1...
2034         dividedFiles.resize(processors);
2035         
2036         //for each file, figure out which process will complete it
2037         //want to divide the load intelligently so the big files are spread between processes
2038         for (int i = 0; i < filenames.size(); i++) { 
2039             int processToAssign = (i+1) % processors; 
2040             if (processToAssign == 0) { processToAssign = processors; }
2041             
2042             dividedFiles[(processToAssign-1)].push_back(filenames[i]);
2043         }
2044         
2045         //now lets reverse the order of ever other process, so we balance big files running with little ones
2046         for (int i = 0; i < processors; i++) {
2047             int remainder = ((i+1) % processors);
2048             if (remainder) {  reverse(dividedFiles[i].begin(), dividedFiles[i].end());  }
2049         }
2050         
2051                         
2052                 //divide the groups between the processors
2053                 /*vector<linePair> lines;
2054         vector<int> numFilesToComplete;
2055                 int numFilesPerProcessor = filenames.size() / processors;
2056                 for (int i = 0; i < processors; i++) {
2057                         int startIndex =  i * numFilesPerProcessor;
2058                         int endIndex = (i+1) * numFilesPerProcessor;
2059                         if(i == (processors - 1)){      endIndex = filenames.size();    }
2060                         lines.push_back(linePair(startIndex, endIndex));
2061             numFilesToComplete.push_back((endIndex-startIndex));
2062                 }*/
2063                 
2064         #if defined (__APPLE__) || (__MACH__) || (linux) || (__linux) || (__linux__) || (__unix__) || (__unix)          
2065                 
2066                 //loop through and create all the processes you want
2067                 while (process != processors) {
2068                         int pid = fork();
2069                         
2070                         if (pid > 0) {
2071                                 processIDS.push_back(pid);  //create map from line number to pid so you can append files in correct order later
2072                                 process++;
2073                         }else if (pid == 0){
2074                                 num = driver(dividedFiles[process], compositeFASTAFileName + toString(getpid()) + ".temp", compositeNamesFileName  + toString(getpid()) + ".temp");
2075                 
2076                 //pass numSeqs to parent
2077                                 ofstream out;
2078                                 string tempFile = compositeFASTAFileName + toString(getpid()) + ".num.temp";
2079                                 m->openOutputFile(tempFile, out);
2080                                 out << num << endl;
2081                                 out.close();
2082                 
2083                                 exit(0);
2084                         }else { 
2085                                 m->mothurOut("[ERROR]: unable to spawn the necessary processes."); m->mothurOutEndLine(); 
2086                                 for (int i = 0; i < processIDS.size(); i++) { kill (processIDS[i], SIGINT); }
2087                                 exit(0);
2088                         }
2089                 }
2090                 
2091                 //do my part
2092                 driver(dividedFiles[0], compositeFASTAFileName, compositeNamesFileName);
2093                 
2094                 //force parent to wait until all the processes are done
2095                 for (int i=0;i<processIDS.size();i++) { 
2096                         int temp = processIDS[i];
2097                         wait(&temp);
2098                 }
2099         
2100         #else
2101         
2102         //////////////////////////////////////////////////////////////////////////////////////////////////////
2103         
2104         /////////////////////// NOT WORKING, ACCESS VIOLATION ON READ OF FLOWGRAMS IN THREAD /////////////////
2105         
2106         //////////////////////////////////////////////////////////////////////////////////////////////////////
2107                 //Windows version shared memory, so be careful when passing variables through the shhhFlowsData struct. 
2108                 //Above fork() will clone, so memory is separate, but that's not the case with windows, 
2109                 //////////////////////////////////////////////////////////////////////////////////////////////////////
2110                 /*
2111                 vector<shhhFlowsData*> pDataArray; 
2112                 DWORD   dwThreadIdArray[processors-1];
2113                 HANDLE  hThreadArray[processors-1]; 
2114                 
2115                 //Create processor worker threads.
2116                 for( int i=0; i<processors-1; i++ ){
2117                         // Allocate memory for thread data.
2118                         string extension = "";
2119                         if (i != 0) { extension = toString(i) + ".temp"; }
2120                         
2121             shhhFlowsData* tempFlow = new shhhFlowsData(filenames, (compositeFASTAFileName + extension), (compositeNamesFileName + extension), outputDir, flowOrder, jointLookUp, singleLookUp, m, lines[i].start, lines[i].end, cutoff, sigma, minDelta, maxIters, i);
2122                         pDataArray.push_back(tempFlow);
2123                         processIDS.push_back(i);
2124             
2125                         hThreadArray[i] = CreateThread(NULL, 0, ShhhFlowsThreadFunction, pDataArray[i], 0, &dwThreadIdArray[i]);   
2126                 }
2127                 
2128                 //using the main process as a worker saves time and memory
2129                 //do my part
2130                 driver(filenames, compositeFASTAFileName, compositeNamesFileName, lines[processors-1].start, lines[processors-1].end);
2131                 
2132                 //Wait until all threads have terminated.
2133                 WaitForMultipleObjects(processors-1, hThreadArray, TRUE, INFINITE);
2134                 
2135                 //Close all thread handles and free memory allocations.
2136                 for(int i=0; i < pDataArray.size(); i++){
2137                         for(int j=0; j < pDataArray[i]->outputNames.size(); j++){ outputNames.push_back(pDataArray[i]->outputNames[j]); }
2138                         CloseHandle(hThreadArray[i]);
2139                         delete pDataArray[i];
2140                 }
2141                 */
2142         #endif
2143         
2144         for (int i=0;i<processIDS.size();i++) { 
2145             ifstream in;
2146                         string tempFile =  compositeFASTAFileName + toString(processIDS[i]) + ".num.temp";
2147                         m->openInputFile(tempFile, in);
2148                         if (!in.eof()) { 
2149                 int tempNum = 0; 
2150                 in >> tempNum; 
2151                 if (tempNum != dividedFiles[i+1].size()) {
2152                     m->mothurOut("[ERROR]: main process expected " + toString(processIDS[i]) + " to complete " + toString(dividedFiles[i+1].size()) + " files, and it only reported completing " + toString(tempNum) + ". This will cause file mismatches.  The flow files may be too large to process with multiple processors. \n");
2153                 }
2154             }
2155                         in.close(); m->mothurRemove(tempFile);
2156             
2157             if (compositeFASTAFileName != "") {
2158                 m->appendFiles((compositeFASTAFileName + toString(processIDS[i]) + ".temp"), compositeFASTAFileName);
2159                 m->appendFiles((compositeNamesFileName + toString(processIDS[i]) + ".temp"), compositeNamesFileName);
2160                 m->mothurRemove((compositeFASTAFileName + toString(processIDS[i]) + ".temp"));
2161                 m->mothurRemove((compositeNamesFileName + toString(processIDS[i]) + ".temp"));
2162             }
2163         }
2164         
2165         return 0;
2166         
2167     }
2168         catch(exception& e) {
2169                 m->errorOut(e, "ShhherCommand", "createProcesses");
2170                 exit(1);
2171         }
2172 }
2173 /**************************************************************************************************/
2174
2175 vector<string> ShhherCommand::parseFlowFiles(string filename){
2176     try {
2177         vector<string> files;
2178         int count = 0;
2179         
2180         ifstream in;
2181         m->openInputFile(filename, in);
2182         
2183         int thisNumFLows = 0;
2184         in >> thisNumFLows; m->gobble(in);
2185         
2186         while (!in.eof()) {
2187             if (m->control_pressed) { break; }
2188             
2189             ofstream out;
2190             string outputFileName = filename + toString(count) + ".temp";
2191             m->openOutputFile(outputFileName, out);
2192             out << thisNumFLows << endl;
2193             files.push_back(outputFileName);
2194             
2195             int numLinesWrote = 0;
2196             for (int i = 0; i < largeSize; i++) {
2197                 if (in.eof()) { break; }
2198                 string line = m->getline(in); m->gobble(in);
2199                 out << line << endl;
2200                 numLinesWrote++;
2201             }
2202             out.close();
2203             
2204             if (numLinesWrote == 0) {  m->mothurRemove(outputFileName); files.pop_back();  }
2205             count++;
2206         }
2207         in.close();
2208         
2209         if (m->control_pressed) { for (int i = 0; i < files.size(); i++) { m->mothurRemove(files[i]); }  files.clear(); }
2210         
2211         m->mothurOut("\nDivided " + filename + " into " + toString(files.size()) + " files.\n\n"); 
2212         
2213         return files;
2214     }
2215         catch(exception& e) {
2216                 m->errorOut(e, "ShhherCommand", "parseFlowFiles");
2217                 exit(1);
2218         }
2219 }
2220 /**************************************************************************************************/
2221
2222 int ShhherCommand::driver(vector<string> filenames, string thisCompositeFASTAFileName, string thisCompositeNamesFileName){
2223     try {
2224         
2225         int numCompleted = 0;
2226         
2227         for(int i=0;i<filenames.size();i++){
2228                         
2229                         if (m->control_pressed) { break; }
2230                         
2231             vector<string> theseFlowFileNames; theseFlowFileNames.push_back(filenames[i]);
2232             if (large) {  theseFlowFileNames = parseFlowFiles(filenames[i]);  }
2233             
2234             if (m->control_pressed) { break; }
2235             
2236             double begClock = clock();
2237             unsigned long long begTime;
2238             
2239             string fileNameForOutput = filenames[i];
2240             
2241             for (int g = 0; g < theseFlowFileNames.size(); g++) {
2242                 
2243                 string flowFileName = theseFlowFileNames[g];
2244                 m->mothurOut("\n>>>>>\tProcessing " + flowFileName + " (file " + toString(i+1) + " of " + toString(filenames.size()) + ")\t<<<<<\n");
2245                 m->mothurOut("Reading flowgrams...\n");
2246                 
2247                 vector<string> seqNameVector;
2248                 vector<int> lengths;
2249                 vector<short> flowDataIntI;
2250                 vector<double> flowDataPrI;
2251                 map<string, int> nameMap;
2252                 vector<short> uniqueFlowgrams;
2253                 vector<int> uniqueCount;
2254                 vector<int> mapSeqToUnique;
2255                 vector<int> mapUniqueToSeq;
2256                 vector<int> uniqueLengths;
2257                 int numFlowCells;
2258                 
2259                 if (m->debug) { m->mothurOut("[DEBUG]: About to read flowgrams.\n"); }
2260                 int numSeqs = getFlowData(flowFileName, seqNameVector, lengths, flowDataIntI, nameMap, numFlowCells);
2261                 
2262                 if (m->control_pressed) { break; }
2263                 
2264                 m->mothurOut("Identifying unique flowgrams...\n");
2265                 int numUniques = getUniques(numSeqs, numFlowCells, uniqueFlowgrams, uniqueCount, uniqueLengths, mapSeqToUnique, mapUniqueToSeq, lengths, flowDataPrI, flowDataIntI);
2266                 
2267                 if (m->control_pressed) { break; }
2268                 
2269                 m->mothurOut("Calculating distances between flowgrams...\n");
2270                 string distFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.dist";
2271                 begTime = time(NULL);
2272                
2273                 
2274                 flowDistParentFork(numFlowCells, distFileName, numUniques, mapUniqueToSeq, mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2275                 
2276                 m->mothurOutEndLine();
2277                 m->mothurOut("Total time: " + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/CLOCKS_PER_SEC) + '\n');
2278                 
2279                 
2280                 string namesFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.names";
2281                 createNamesFile(numSeqs, numUniques, namesFileName, seqNameVector, mapSeqToUnique, mapUniqueToSeq);
2282                 
2283                 if (m->control_pressed) { break; }
2284                 
2285                 m->mothurOut("\nClustering flowgrams...\n");
2286                 string listFileName = flowFileName.substr(0,flowFileName.find_last_of('.')) + ".shhh.list";
2287                 cluster(listFileName, distFileName, namesFileName);
2288                 
2289                 if (m->control_pressed) { break; }
2290                 
2291                 vector<int> otuData;
2292                 vector<int> cumNumSeqs;
2293                 vector<int> nSeqsPerOTU;
2294                 vector<vector<int> > aaP;       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2295                 vector<vector<int> > aaI;       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2296                 vector<int> seqNumber;          //tMaster->anP:         the sequence id number sorted by OTU
2297                 vector<int> seqIndex;           //tMaster->anI;         the index that corresponds to seqNumber
2298                 
2299                 
2300                 int numOTUs = getOTUData(numSeqs, listFileName, otuData, cumNumSeqs, nSeqsPerOTU, aaP, aaI, seqNumber, seqIndex, nameMap);
2301                 
2302                 if (m->control_pressed) { break; }
2303                 
2304                 m->mothurRemove(distFileName);
2305                 m->mothurRemove(namesFileName);
2306                 m->mothurRemove(listFileName);
2307                 
2308                 vector<double> dist;            //adDist - distance of sequences to centroids
2309                 vector<short> change;           //did the centroid sequence change? 0 = no; 1 = yes
2310                 vector<int> centroids;          //the representative flowgram for each cluster m
2311                 vector<double> weight;
2312                 vector<double> singleTau;       //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2313                 vector<int> nSeqsBreaks;
2314                 vector<int> nOTUsBreaks;
2315                 
2316                 if (m->debug) { m->mothurOut("[DEBUG]: numSeqs = " + toString(numSeqs) + " numOTUS = " + toString(numOTUs) + " about to alloc a dist vector with size = " + toString((numSeqs * numOTUs)) + ".\n"); }
2317                 
2318                 dist.assign(numSeqs * numOTUs, 0);
2319                 change.assign(numOTUs, 1);
2320                 centroids.assign(numOTUs, -1);
2321                 weight.assign(numOTUs, 0);
2322                 singleTau.assign(numSeqs, 1.0);
2323                 
2324                 nSeqsBreaks.assign(2, 0);
2325                 nOTUsBreaks.assign(2, 0);
2326                 
2327                 nSeqsBreaks[0] = 0;
2328                 nSeqsBreaks[1] = numSeqs;
2329                 nOTUsBreaks[1] = numOTUs;
2330                 
2331                 if (m->debug) { m->mothurOut("[DEBUG]: done allocating memory, about to denoise.\n"); }
2332                 
2333                 if (m->control_pressed) { break; }
2334                 
2335                 double maxDelta = 0;
2336                 int iter = 0;
2337                 
2338                 begClock = clock();
2339                 begTime = time(NULL);
2340                 
2341                 m->mothurOut("\nDenoising flowgrams...\n");
2342                 m->mothurOut("iter\tmaxDelta\tnLL\t\tcycletime\n");
2343                 
2344                 while((maxIters == 0 && maxDelta > minDelta) || iter < MIN_ITER || (maxDelta > minDelta && iter < maxIters)){
2345                     
2346                     if (m->control_pressed) { break; }
2347                     
2348                     double cycClock = clock();
2349                     unsigned long long cycTime = time(NULL);
2350                     fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2351                     
2352                     if (m->control_pressed) { break; }
2353                     
2354                     calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber);
2355                     
2356                     if (m->control_pressed) { break; }
2357                     
2358                     maxDelta = getNewWeights(numOTUs, cumNumSeqs, nSeqsPerOTU, singleTau, seqNumber, weight);  
2359                     
2360                     if (m->control_pressed) { break; }
2361                     
2362                     double nLL = getLikelihood(numSeqs, numOTUs, nSeqsPerOTU, seqNumber, cumNumSeqs, seqIndex, dist, weight); 
2363                     
2364                     if (m->control_pressed) { break; }
2365                     
2366                     checkCentroids(numOTUs, centroids, weight);
2367                     
2368                     if (m->control_pressed) { break; }
2369                     
2370                     calcNewDistances(numSeqs, numOTUs, nSeqsPerOTU,  dist, weight, change, centroids, aaP, singleTau, aaI, seqNumber, seqIndex, uniqueFlowgrams, flowDataIntI, numFlowCells, lengths);
2371                     
2372                     if (m->control_pressed) { break; }
2373                     
2374                     iter++;
2375                     
2376                     m->mothurOut(toString(iter) + '\t' + toString(maxDelta) + '\t' + toString(nLL) + '\t' + toString(time(NULL) - cycTime) + '\t' + toString((clock() - cycClock)/(double)CLOCKS_PER_SEC) + '\n');
2377                     
2378                 }       
2379                 
2380                 if (m->control_pressed) { break; }
2381                 
2382                 m->mothurOut("\nFinalizing...\n");
2383                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);
2384                 
2385                 if (m->control_pressed) { break; }
2386                 
2387                 setOTUs(numOTUs, numSeqs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, otuData, singleTau, dist, aaP, aaI);
2388                 
2389                 if (m->control_pressed) { break; }
2390                 
2391                 vector<int> otuCounts(numOTUs, 0);
2392                 for(int j=0;j<numSeqs;j++)      {       otuCounts[otuData[j]]++;        }
2393                 
2394                 calcCentroidsDriver(numOTUs, cumNumSeqs, nSeqsPerOTU, seqIndex, change, centroids, singleTau, mapSeqToUnique, uniqueFlowgrams, flowDataIntI, lengths, numFlowCells, seqNumber); 
2395                 
2396                 if (m->control_pressed) { break; }
2397                 
2398                 if ((large) && (g == 0)) {  flowFileName = filenames[i]; theseFlowFileNames[0] = filenames[i]; }
2399                 string thisOutputDir = outputDir;
2400                 if (outputDir == "") {  thisOutputDir = m->hasPath(flowFileName);  }
2401                 map<string, string> variables; 
2402                 variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(flowFileName));
2403                 string qualityFileName = getOutputFileName("qfile",variables);
2404                 string fastaFileName = getOutputFileName("fasta",variables);
2405                 string nameFileName = getOutputFileName("name",variables);
2406                 string otuCountsFileName = getOutputFileName("counts",variables);
2407                 string fileRoot = m->getRootName(m->getSimpleName(flowFileName));
2408                 int pos = fileRoot.find_first_of('.');
2409                 string fileGroup = fileRoot;
2410                 if (pos != string::npos) {  fileGroup = fileRoot.substr(pos+1, (fileRoot.length()-1-(pos+1)));  }
2411                 string groupFileName = getOutputFileName("group",variables);
2412
2413                 
2414                 writeQualities(numOTUs, numFlowCells, qualityFileName, otuCounts, nSeqsPerOTU, seqNumber, singleTau, flowDataIntI, uniqueFlowgrams, cumNumSeqs, mapUniqueToSeq, seqNameVector, centroids, aaI); if (m->control_pressed) { break; }
2415                 writeSequences(thisCompositeFASTAFileName, numOTUs, numFlowCells, fastaFileName, otuCounts, uniqueFlowgrams, seqNameVector, aaI, centroids);if (m->control_pressed) { break; }
2416                 writeNames(thisCompositeNamesFileName, numOTUs, nameFileName, otuCounts, seqNameVector, aaI, nSeqsPerOTU);                              if (m->control_pressed) { break; }
2417                 writeClusters(otuCountsFileName, numOTUs, numFlowCells,otuCounts, centroids, uniqueFlowgrams, seqNameVector, aaI, nSeqsPerOTU, lengths, flowDataIntI);                  if (m->control_pressed) { break; }
2418                 writeGroups(groupFileName, fileGroup, numSeqs, seqNameVector);                                          if (m->control_pressed) { break; }
2419                 
2420                 if (large) {
2421                     if (g > 0) {
2422                         variables["[filename]"] = thisOutputDir + m->getRootName(m->getSimpleName(theseFlowFileNames[0]));
2423                         m->appendFiles(qualityFileName, getOutputFileName("qfile",variables));
2424                         m->mothurRemove(qualityFileName);
2425                         m->appendFiles(fastaFileName, getOutputFileName("fasta",variables));
2426                         m->mothurRemove(fastaFileName);
2427                         m->appendFiles(nameFileName, getOutputFileName("name",variables));
2428                         m->mothurRemove(nameFileName);
2429                         m->appendFiles(otuCountsFileName, getOutputFileName("counts",variables));
2430                         m->mothurRemove(otuCountsFileName);
2431                         m->appendFiles(groupFileName, getOutputFileName("group",variables));
2432                         m->mothurRemove(groupFileName);
2433                     }
2434                     m->mothurRemove(theseFlowFileNames[g]);
2435                 }
2436                         }
2437             
2438             numCompleted++;
2439                         m->mothurOut("Total time to process " + fileNameForOutput + ":\t" + toString(time(NULL) - begTime) + '\t' + toString((clock() - begClock)/(double)CLOCKS_PER_SEC) + '\n');
2440                 }
2441                 
2442         if (m->control_pressed) { for (int i = 0; i < outputNames.size(); i++) { m->mothurRemove(outputNames[i]); } return 0; }
2443         
2444         return numCompleted;
2445         
2446     }catch(exception& e) {
2447             m->errorOut(e, "ShhherCommand", "driver");
2448             exit(1);
2449     }
2450 }
2451
2452 /**************************************************************************************************/
2453 int ShhherCommand::getFlowData(string filename, vector<string>& thisSeqNameVector, vector<int>& thisLengths, vector<short>& thisFlowDataIntI, map<string, int>& thisNameMap, int& numFlowCells){
2454         try{
2455        
2456                 ifstream flowFile;
2457        
2458                 m->openInputFile(filename, flowFile);
2459                 
2460                 string seqName;
2461                 int currentNumFlowCells;
2462                 float intensity;
2463         thisSeqNameVector.clear();
2464                 thisLengths.clear();
2465                 thisFlowDataIntI.clear();
2466                 thisNameMap.clear();
2467                 
2468                 string numFlowTest;
2469         flowFile >> numFlowTest;
2470         
2471         if (!m->isContainingOnlyDigits(numFlowTest)) { m->mothurOut("[ERROR]: expected a number and got " + numFlowTest + ", quitting. Did you use the flow parameter instead of the file parameter?"); m->mothurOutEndLine(); exit(1); }
2472         else { convert(numFlowTest, numFlowCells); }
2473         
2474         if (m->debug) { m->mothurOut("[DEBUG]: numFlowCells = " + toString(numFlowCells) + ".\n"); }
2475                 int index = 0;//pcluster
2476                 while(!flowFile.eof()){
2477                         
2478                         if (m->control_pressed) { break; }
2479                         
2480                         flowFile >> seqName >> currentNumFlowCells;
2481             
2482                         thisLengths.push_back(currentNumFlowCells);
2483            
2484                         thisSeqNameVector.push_back(seqName);
2485                         thisNameMap[seqName] = index++;//pcluster
2486             
2487             if (m->debug) { m->mothurOut("[DEBUG]: seqName = " + seqName + " length = " + toString(currentNumFlowCells) + " index = " + toString(index) + "\n"); }
2488             
2489                         for(int i=0;i<numFlowCells;i++){
2490                                 flowFile >> intensity;
2491                                 if(intensity > 9.99)    {       intensity = 9.99;       }
2492                                 int intI = int(100 * intensity + 0.0001);
2493                                 thisFlowDataIntI.push_back(intI);
2494                         }
2495                         m->gobble(flowFile);
2496                 }
2497                 flowFile.close();
2498                 
2499                 int numSeqs = thisSeqNameVector.size();         
2500                 
2501                 for(int i=0;i<numSeqs;i++){
2502                         
2503                         if (m->control_pressed) { break; }
2504                         
2505                         int iNumFlowCells = i * numFlowCells;
2506                         for(int j=thisLengths[i];j<numFlowCells;j++){
2507                                 thisFlowDataIntI[iNumFlowCells + j] = 0;
2508                         }
2509                 }
2510         
2511         return numSeqs;
2512                 
2513         }
2514         catch(exception& e) {
2515                 m->errorOut(e, "ShhherCommand", "getFlowData");
2516                 exit(1);
2517         }
2518 }
2519 /**************************************************************************************************/
2520
2521 int ShhherCommand::flowDistParentFork(int numFlowCells, string distFileName, int stopSeq, vector<int>& mapUniqueToSeq, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2522         try{            
2523         
2524                 ostringstream outStream;
2525                 outStream.setf(ios::fixed, ios::floatfield);
2526                 outStream.setf(ios::dec, ios::basefield);
2527                 outStream.setf(ios::showpoint);
2528                 outStream.precision(6);
2529                 
2530                 int begTime = time(NULL);
2531                 double begClock = clock();
2532         
2533                 for(int i=0;i<stopSeq;i++){
2534                         
2535                         if (m->control_pressed) { break; }
2536                         
2537                         for(int j=0;j<i;j++){
2538                                 float flowDistance = calcPairwiseDist(numFlowCells, mapUniqueToSeq[i], mapUniqueToSeq[j], mapSeqToUnique, lengths, flowDataPrI, flowDataIntI);
2539                 
2540                                 if(flowDistance < 1e-6){
2541                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << 0.000000 << endl;
2542                                 }
2543                                 else if(flowDistance <= cutoff){
2544                                         outStream << mapUniqueToSeq[i] << '\t' << mapUniqueToSeq[j] << '\t' << flowDistance << endl;
2545                                 }
2546                         }
2547                         if(i % 100 == 0){
2548                                 m->mothurOutJustToScreen(toString(i) + "\t" + toString(time(NULL) - begTime));
2549                                 m->mothurOutJustToScreen("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC)+"\n");
2550                         }
2551                 }
2552                 
2553                 ofstream distFile(distFileName.c_str());
2554                 distFile << outStream.str();            
2555                 distFile.close();
2556                 
2557                 if (m->control_pressed) {}
2558                 else {
2559                         m->mothurOutJustToScreen(toString(stopSeq-1) + "\t" + toString(time(NULL) - begTime));
2560                         m->mothurOutJustToScreen("\t" + toString((clock()-begClock)/CLOCKS_PER_SEC)+"\n");
2561                 }
2562         
2563         return 0;
2564         }
2565         catch(exception& e) {
2566                 m->errorOut(e, "ShhherCommand", "flowDistParentFork");
2567                 exit(1);
2568         }
2569 }
2570 /**************************************************************************************************/
2571
2572 float ShhherCommand::calcPairwiseDist(int numFlowCells, int seqA, int seqB, vector<int>& mapSeqToUnique, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2573         try{
2574                 int minLength = lengths[mapSeqToUnique[seqA]];
2575                 if(lengths[seqB] < minLength){  minLength = lengths[mapSeqToUnique[seqB]];      }
2576                 
2577                 int ANumFlowCells = seqA * numFlowCells;
2578                 int BNumFlowCells = seqB * numFlowCells;
2579                 
2580                 float dist = 0;
2581                 
2582                 for(int i=0;i<minLength;i++){
2583                         
2584                         if (m->control_pressed) { break; }
2585                         
2586                         int flowAIntI = flowDataIntI[ANumFlowCells + i];
2587                         float flowAPrI = flowDataPrI[ANumFlowCells + i];
2588                         
2589                         int flowBIntI = flowDataIntI[BNumFlowCells + i];
2590                         float flowBPrI = flowDataPrI[BNumFlowCells + i];
2591                         dist += jointLookUp[flowAIntI * NUMBINS + flowBIntI] - flowAPrI - flowBPrI;
2592                 }
2593                 
2594                 dist /= (float) minLength;
2595                 return dist;
2596         }
2597         catch(exception& e) {
2598                 m->errorOut(e, "ShhherCommand", "calcPairwiseDist");
2599                 exit(1);
2600         }
2601 }
2602
2603 /**************************************************************************************************/
2604
2605 int ShhherCommand::getUniques(int numSeqs, int numFlowCells, vector<short>& uniqueFlowgrams, vector<int>& uniqueCount, vector<int>& uniqueLengths, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq, vector<int>& lengths, vector<double>& flowDataPrI, vector<short>& flowDataIntI){
2606         try{
2607                 int numUniques = 0;
2608                 uniqueFlowgrams.assign(numFlowCells * numSeqs, -1);
2609                 uniqueCount.assign(numSeqs, 0);                                                 //      anWeights
2610                 uniqueLengths.assign(numSeqs, 0);
2611                 mapSeqToUnique.assign(numSeqs, -1);
2612                 mapUniqueToSeq.assign(numSeqs, -1);
2613                 
2614                 vector<short> uniqueFlowDataIntI(numFlowCells * numSeqs, -1);
2615                 
2616                 for(int i=0;i<numSeqs;i++){
2617                         
2618                         if (m->control_pressed) { break; }
2619                         
2620                         int index = 0;
2621                         
2622                         vector<short> current(numFlowCells);
2623                         for(int j=0;j<numFlowCells;j++){
2624                                 current[j] = short(((flowDataIntI[i * numFlowCells + j] + 50.0)/100.0));
2625                         }
2626             
2627                         for(int j=0;j<numUniques;j++){
2628                                 int offset = j * numFlowCells;
2629                                 bool toEnd = 1;
2630                                 
2631                                 int shorterLength;
2632                                 if(lengths[i] < uniqueLengths[j])       {       shorterLength = lengths[i];                     }
2633                                 else                                                            {       shorterLength = uniqueLengths[j];       }
2634                 
2635                                 for(int k=0;k<shorterLength;k++){
2636                                         if(current[k] != uniqueFlowgrams[offset + k]){
2637                                                 toEnd = 0;
2638                                                 break;
2639                                         }
2640                                 }
2641                                 
2642                                 if(toEnd){
2643                                         mapSeqToUnique[i] = j;
2644                                         uniqueCount[j]++;
2645                                         index = j;
2646                                         if(lengths[i] > uniqueLengths[j])       {       uniqueLengths[j] = lengths[i];  }
2647                                         break;
2648                                 }
2649                                 index++;
2650                         }
2651                         
2652                         if(index == numUniques){
2653                                 uniqueLengths[numUniques] = lengths[i];
2654                                 uniqueCount[numUniques] = 1;
2655                                 mapSeqToUnique[i] = numUniques;//anMap
2656                                 mapUniqueToSeq[numUniques] = i;//anF
2657                                 
2658                                 for(int k=0;k<numFlowCells;k++){
2659                                         uniqueFlowgrams[numUniques * numFlowCells + k] = current[k];
2660                                         uniqueFlowDataIntI[numUniques * numFlowCells + k] = flowDataIntI[i * numFlowCells + k];
2661                                 }
2662                                 
2663                                 numUniques++;
2664                         }
2665                 }
2666                 uniqueFlowDataIntI.resize(numFlowCells * numUniques);
2667                 uniqueLengths.resize(numUniques);       
2668                 
2669                 flowDataPrI.resize(numSeqs * numFlowCells, 0);
2670                 for(int i=0;i<flowDataPrI.size();i++)   {       if (m->control_pressed) { break; } flowDataPrI[i] = getProbIntensity(flowDataIntI[i]);          }
2671         
2672         return numUniques;
2673         }
2674         catch(exception& e) {
2675                 m->errorOut(e, "ShhherCommand", "getUniques");
2676                 exit(1);
2677         }
2678 }
2679 /**************************************************************************************************/
2680 int ShhherCommand::createNamesFile(int numSeqs, int numUniques, string filename, vector<string>& seqNameVector, vector<int>& mapSeqToUnique, vector<int>& mapUniqueToSeq){
2681         try{
2682                 
2683                 vector<string> duplicateNames(numUniques, "");
2684                 for(int i=0;i<numSeqs;i++){
2685                         duplicateNames[mapSeqToUnique[i]] += seqNameVector[i] + ',';
2686                 }
2687                 
2688                 ofstream nameFile;
2689                 m->openOutputFile(filename, nameFile);
2690                 
2691                 for(int i=0;i<numUniques;i++){
2692                         
2693                         if (m->control_pressed) { break; }
2694                         
2695             //                  nameFile << seqNameVector[mapUniqueToSeq[i]] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2696                         nameFile << mapUniqueToSeq[i] << '\t' << duplicateNames[i].substr(0, duplicateNames[i].find_last_of(',')) << endl;
2697                 }
2698                 
2699                 nameFile.close();
2700         
2701                 return 0;
2702         }
2703         catch(exception& e) {
2704                 m->errorOut(e, "ShhherCommand", "createNamesFile");
2705                 exit(1);
2706         }
2707 }
2708 //**********************************************************************************************************************
2709
2710 int ShhherCommand::cluster(string filename, string distFileName, string namesFileName){
2711         try {
2712                 
2713                 ReadMatrix* read = new ReadColumnMatrix(distFileName);  
2714                 read->setCutoff(cutoff);
2715                 
2716                 NameAssignment* clusterNameMap = new NameAssignment(namesFileName);
2717                 clusterNameMap->readMap();
2718                 read->read(clusterNameMap);
2719         
2720                 ListVector* list = read->getListVector();
2721                 SparseDistanceMatrix* matrix = read->getDMatrix();
2722                 
2723                 delete read; 
2724                 delete clusterNameMap; 
2725         
2726                 RAbundVector* rabund = new RAbundVector(list->getRAbundVector());
2727                 
2728                 Cluster* cluster = new CompleteLinkage(rabund, list, matrix, cutoff, "furthest"); 
2729                 string tag = cluster->getTag();
2730                 
2731                 double clusterCutoff = cutoff;
2732                 while (matrix->getSmallDist() <= clusterCutoff && matrix->getNNodes() > 0){
2733                         
2734                         if (m->control_pressed) { break; }
2735                         
2736                         cluster->update(clusterCutoff);
2737                 }
2738                 
2739                 list->setLabel(toString(cutoff));
2740                 
2741                 ofstream listFile;
2742                 m->openOutputFile(filename, listFile);
2743                 list->print(listFile);
2744                 listFile.close();
2745                 
2746                 delete matrix;  delete cluster; delete rabund; delete list;
2747         
2748                 return 0;
2749         }
2750         catch(exception& e) {
2751                 m->errorOut(e, "ShhherCommand", "cluster");
2752                 exit(1);        
2753         }               
2754 }
2755 /**************************************************************************************************/
2756
2757 int ShhherCommand::getOTUData(int numSeqs, string fileName,  vector<int>& otuData,
2758                                vector<int>& cumNumSeqs,
2759                                vector<int>& nSeqsPerOTU,
2760                                vector<vector<int> >& aaP,       //tMaster->aanP:        each row is a different otu / each col contains the sequence indices
2761                                vector<vector<int> >& aaI,       //tMaster->aanI:        that are in each otu - can't differentiate between aaP and aaI 
2762                                vector<int>& seqNumber,          //tMaster->anP:         the sequence id number sorted by OTU
2763                                vector<int>& seqIndex,
2764                                map<string, int>& nameMap){
2765         try {
2766         
2767                 ifstream listFile;
2768                 m->openInputFile(fileName, listFile);
2769                 string label;
2770         int numOTUs;
2771                 
2772                 listFile >> label >> numOTUs;
2773         
2774         if (m->debug) { m->mothurOut("[DEBUG]: Getting OTU Data...\n"); }
2775         
2776                 otuData.assign(numSeqs, 0);
2777                 cumNumSeqs.assign(numOTUs, 0);
2778                 nSeqsPerOTU.assign(numOTUs, 0);
2779                 aaP.clear();aaP.resize(numOTUs);
2780                 
2781                 seqNumber.clear();
2782                 aaI.clear();
2783                 seqIndex.clear();
2784                 
2785                 string singleOTU = "";
2786                 
2787                 for(int i=0;i<numOTUs;i++){
2788                         
2789                         if (m->control_pressed) { break; }
2790             if (m->debug) { m->mothurOut("[DEBUG]: processing OTU " + toString(i) + ".\n"); }
2791             
2792                         listFile >> singleOTU;
2793                         
2794                         istringstream otuString(singleOTU);
2795             
2796                         while(otuString){
2797                                 
2798                                 string seqName = "";
2799                                 
2800                                 for(int j=0;j<singleOTU.length();j++){
2801                                         char letter = otuString.get();
2802                                         
2803                                         if(letter != ','){
2804                                                 seqName += letter;
2805                                         }
2806                                         else{
2807                                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2808                                                 int index = nmIt->second;
2809                                                 
2810                                                 nameMap.erase(nmIt);
2811                                                 
2812                                                 otuData[index] = i;
2813                                                 nSeqsPerOTU[i]++;
2814                                                 aaP[i].push_back(index);
2815                                                 seqName = "";
2816                                         }
2817                                 }
2818                                 
2819                                 map<string,int>::iterator nmIt = nameMap.find(seqName);
2820                 
2821                                 int index = nmIt->second;
2822                                 nameMap.erase(nmIt);
2823                 
2824                                 otuData[index] = i;
2825                                 nSeqsPerOTU[i]++;
2826                                 aaP[i].push_back(index);        
2827                                 
2828                                 otuString.get();
2829                         }
2830                         
2831                         sort(aaP[i].begin(), aaP[i].end());
2832                         for(int j=0;j<nSeqsPerOTU[i];j++){
2833                                 seqNumber.push_back(aaP[i][j]);
2834                         }
2835                         for(int j=nSeqsPerOTU[i];j<numSeqs;j++){
2836                                 aaP[i].push_back(0);
2837                         }
2838                         
2839                         
2840                 }
2841                 
2842                 for(int i=1;i<numOTUs;i++){
2843                         cumNumSeqs[i] = cumNumSeqs[i-1] + nSeqsPerOTU[i-1];
2844                 }
2845                 aaI = aaP;
2846                 seqIndex = seqNumber;
2847                 
2848                 listFile.close();       
2849         
2850         return numOTUs;
2851                 
2852         }
2853         catch(exception& e) {
2854                 m->errorOut(e, "ShhherCommand", "getOTUData");
2855                 exit(1);        
2856         }               
2857 }
2858 /**************************************************************************************************/
2859
2860 int ShhherCommand::calcCentroidsDriver(int numOTUs, 
2861                                           vector<int>& cumNumSeqs,
2862                                           vector<int>& nSeqsPerOTU,
2863                                           vector<int>& seqIndex,
2864                                           vector<short>& change,                //did the centroid sequence change? 0 = no; 1 = yes
2865                                           vector<int>& centroids,               //the representative flowgram for each cluster m
2866                                           vector<double>& singleTau,    //tMaster->adTau:       1-D Tau vector (1xnumSeqs)
2867                                           vector<int>& mapSeqToUnique,
2868                                           vector<short>& uniqueFlowgrams,
2869                                           vector<short>& flowDataIntI,
2870                                           vector<int>& lengths,
2871                                           int numFlowCells,
2872                                           vector<int>& seqNumber){                          
2873         
2874         //this function gets the most likely homopolymer length at a flow position for a group of sequences
2875         //within an otu
2876         
2877         try{
2878                 
2879                 for(int i=0;i<numOTUs;i++){
2880                         
2881                         if (m->control_pressed) { break; }
2882                         
2883                         double count = 0;
2884                         int position = 0;
2885                         int minFlowGram = 100000000;
2886                         double minFlowValue = 1e8;
2887                         change[i] = 0; //FALSE
2888                         
2889                         for(int j=0;j<nSeqsPerOTU[i];j++){
2890                                 count += singleTau[seqNumber[cumNumSeqs[i] + j]];
2891                         }
2892             
2893                         if(nSeqsPerOTU[i] > 0 && count > MIN_COUNT){
2894                                 vector<double> adF(nSeqsPerOTU[i]);
2895                                 vector<int> anL(nSeqsPerOTU[i]);
2896                                 
2897                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2898                                         int index = cumNumSeqs[i] + j;
2899                                         int nI = seqIndex[index];
2900                                         int nIU = mapSeqToUnique[nI];
2901                                         
2902                                         int k;
2903                                         for(k=0;k<position;k++){
2904                                                 if(nIU == anL[k]){
2905                                                         break;
2906                                                 }
2907                                         }
2908                                         if(k == position){
2909                                                 anL[position] = nIU;
2910                                                 adF[position] = 0.0000;
2911                                                 position++;
2912                                         }                                               
2913                                 }
2914                                 
2915                                 for(int j=0;j<nSeqsPerOTU[i];j++){
2916                                         int index = cumNumSeqs[i] + j;
2917                                         int nI = seqIndex[index];
2918                                         
2919                                         double tauValue = singleTau[seqNumber[index]];
2920                                         
2921                                         for(int k=0;k<position;k++){
2922                                                 double dist = getDistToCentroid(anL[k], nI, lengths[nI], uniqueFlowgrams, flowDataIntI, numFlowCells);
2923                                                 adF[k] += dist * tauValue;
2924                                         }
2925                                 }
2926                                 
2927                                 for(int j=0;j<position;j++){
2928                                         if(adF[j] < minFlowValue){
2929                                                 minFlowGram = j;
2930                                                 minFlowValue = adF[j];
2931                                         }
2932                                 }
2933                                 
2934                                 if(centroids[i] != anL[minFlowGram]){
2935                                         change[i] = 1;
2936                                         centroids[i] = anL[minFlowGram];
2937                                 }
2938                         }
2939                         else if(centroids[i] != -1){
2940                                 change[i] = 1;
2941                                 centroids[i] = -1;                      
2942                         }
2943                 }
2944         
2945         return 0;
2946         }
2947         catch(exception& e) {
2948                 m->errorOut(e, "ShhherCommand", "calcCentroidsDriver");
2949                 exit(1);        
2950         }               
2951 }
2952 /**************************************************************************************************/
2953
2954 double ShhherCommand::getDistToCentroid(int cent, int flow, int length, vector<short>& uniqueFlowgrams,
2955                                         vector<short>& flowDataIntI, int numFlowCells){
2956         try{
2957                 
2958                 int flowAValue = cent * numFlowCells;
2959                 int flowBValue = flow * numFlowCells;
2960                 
2961                 double dist = 0;
2962         
2963                 for(int i=0;i<length;i++){
2964                         dist += singleLookUp[uniqueFlowgrams[flowAValue] * NUMBINS + flowDataIntI[flowBValue]];
2965                         flowAValue++;
2966                         flowBValue++;
2967                 }
2968                 
2969                 return dist / (double)length;
2970         }
2971         catch(exception& e) {
2972                 m->errorOut(e, "ShhherCommand", "getDistToCentroid");
2973                 exit(1);        
2974         }               
2975 }
2976 /**************************************************************************************************/
2977
2978 double ShhherCommand::getNewWeights(int numOTUs, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<double>& singleTau, vector<int>& seqNumber, vector<double>& weight){
2979         try{
2980                 
2981                 double maxChange = 0;
2982                 
2983                 for(int i=0;i<numOTUs;i++){
2984                         
2985                         if (m->control_pressed) { break; }
2986                         
2987                         double difference = weight[i];
2988                         weight[i] = 0;
2989                         
2990                         for(int j=0;j<nSeqsPerOTU[i];j++){
2991                                 int index = cumNumSeqs[i] + j;
2992                                 double tauValue = singleTau[seqNumber[index]];
2993                                 weight[i] += tauValue;
2994                         }
2995                         
2996                         difference = fabs(weight[i] - difference);
2997                         if(difference > maxChange){     maxChange = difference; }
2998                 }
2999                 return maxChange;
3000         }
3001         catch(exception& e) {
3002                 m->errorOut(e, "ShhherCommand", "getNewWeights");
3003                 exit(1);        
3004         }               
3005 }
3006
3007 /**************************************************************************************************/
3008
3009 double ShhherCommand::getLikelihood(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<int>& seqNumber, vector<int>& cumNumSeqs, vector<int>& seqIndex, vector<double>& dist, vector<double>& weight){
3010         
3011         try{
3012                 
3013                 vector<long double> P(numSeqs, 0);
3014                 int effNumOTUs = 0;
3015                 
3016                 for(int i=0;i<numOTUs;i++){
3017                         if(weight[i] > MIN_WEIGHT){
3018                                 effNumOTUs++;
3019                         }
3020                 }
3021                 
3022                 string hold;
3023                 for(int i=0;i<numOTUs;i++){
3024                         
3025                         if (m->control_pressed) { break; }
3026                         
3027                         for(int j=0;j<nSeqsPerOTU[i];j++){
3028                                 int index = cumNumSeqs[i] + j;
3029                                 int nI = seqIndex[index];
3030                                 double singleDist = dist[seqNumber[index]];
3031                                 
3032                                 P[nI] += weight[i] * exp(-singleDist * sigma);
3033                         }
3034                 }
3035                 double nLL = 0.00;
3036                 for(int i=0;i<numSeqs;i++){
3037                         if(P[i] == 0){  P[i] = DBL_EPSILON;     }
3038             
3039                         nLL += -log(P[i]);
3040                 }
3041                 
3042                 nLL = nLL -(double)numSeqs * log(sigma);
3043         
3044                 return nLL; 
3045         }
3046         catch(exception& e) {
3047                 m->errorOut(e, "ShhherCommand", "getNewWeights");
3048                 exit(1);        
3049         }               
3050 }
3051
3052 /**************************************************************************************************/
3053
3054 int ShhherCommand::checkCentroids(int numOTUs, vector<int>& centroids, vector<double>& weight){
3055         try{
3056                 vector<int> unique(numOTUs, 1);
3057                 
3058                 for(int i=0;i<numOTUs;i++){
3059                         if(centroids[i] == -1 || weight[i] < MIN_WEIGHT){
3060                                 unique[i] = -1;
3061                         }
3062                 }
3063                 
3064                 for(int i=0;i<numOTUs;i++){
3065                         
3066                         if (m->control_pressed) { break; }
3067                         
3068                         if(unique[i] == 1){
3069                                 for(int j=i+1;j<numOTUs;j++){
3070                                         if(unique[j] == 1){
3071                                                 
3072                                                 if(centroids[j] == centroids[i]){
3073                                                         unique[j] = 0;
3074                                                         centroids[j] = -1;
3075                                                         
3076                                                         weight[i] += weight[j];
3077                                                         weight[j] = 0.0;
3078                                                 }
3079                                         }
3080                                 }
3081                         }
3082                 }
3083         
3084         return 0;
3085         }
3086         catch(exception& e) {
3087                 m->errorOut(e, "ShhherCommand", "checkCentroids");
3088                 exit(1);        
3089         }               
3090 }
3091 /**************************************************************************************************/
3092
3093 void ShhherCommand::calcNewDistances(int numSeqs, int numOTUs, vector<int>& nSeqsPerOTU, vector<double>& dist, 
3094                                      vector<double>& weight, vector<short>& change, vector<int>& centroids,
3095                                      vector<vector<int> >& aaP, vector<double>& singleTau, vector<vector<int> >& aaI,   
3096                                      vector<int>& seqNumber, vector<int>& seqIndex,
3097                                      vector<short>& uniqueFlowgrams,
3098                                      vector<short>& flowDataIntI, int numFlowCells, vector<int>& lengths){
3099         
3100         try{
3101                 
3102                 int total = 0;
3103                 vector<double> newTau(numOTUs,0);
3104                 vector<double> norms(numSeqs, 0);
3105                 nSeqsPerOTU.assign(numOTUs, 0);
3106         
3107                 for(int i=0;i<numSeqs;i++){
3108                         
3109                         if (m->control_pressed) { break; }
3110                         
3111                         int indexOffset = i * numOTUs;
3112             
3113                         double offset = 1e8;
3114                         
3115                         for(int j=0;j<numOTUs;j++){
3116                 
3117                                 if(weight[j] > MIN_WEIGHT && change[j] == 1){
3118                                         dist[indexOffset + j] = getDistToCentroid(centroids[j], i, lengths[i], uniqueFlowgrams, flowDataIntI, numFlowCells);
3119                                 }
3120                 
3121                                 if(weight[j] > MIN_WEIGHT && dist[indexOffset + j] < offset){
3122                                         offset = dist[indexOffset + j];
3123                                 }
3124                         }
3125             
3126                         for(int j=0;j<numOTUs;j++){
3127                                 if(weight[j] > MIN_WEIGHT){
3128                                         newTau[j] = exp(sigma * (-dist[indexOffset + j] + offset)) * weight[j];
3129                                         norms[i] += newTau[j];
3130                                 }
3131                                 else{
3132                                         newTau[j] = 0.0;
3133                                 }
3134                         }
3135             
3136                         for(int j=0;j<numOTUs;j++){
3137                                 newTau[j] /= norms[i];
3138                         }
3139             
3140                         for(int j=0;j<numOTUs;j++){
3141                                 if(newTau[j] > MIN_TAU){
3142                                         
3143                                         int oldTotal = total;
3144                                         
3145                                         total++;
3146                                         
3147                                         singleTau.resize(total, 0);
3148                                         seqNumber.resize(total, 0);
3149                                         seqIndex.resize(total, 0);
3150                                         
3151                                         singleTau[oldTotal] = newTau[j];
3152                                         
3153                                         aaP[j][nSeqsPerOTU[j]] = oldTotal;
3154                                         aaI[j][nSeqsPerOTU[j]] = i;
3155                                         nSeqsPerOTU[j]++;
3156                                 }
3157                         }
3158             
3159                 }
3160         
3161         }
3162         catch(exception& e) {
3163                 m->errorOut(e, "ShhherCommand", "calcNewDistances");
3164                 exit(1);        
3165         }               
3166 }
3167 /**************************************************************************************************/
3168
3169 int ShhherCommand::fill(int numOTUs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
3170         try {
3171                 int index = 0;
3172                 for(int i=0;i<numOTUs;i++){
3173                         
3174                         if (m->control_pressed) { return 0; }
3175                         
3176                         cumNumSeqs[i] = index;
3177                         for(int j=0;j<nSeqsPerOTU[i];j++){
3178                                 seqNumber[index] = aaP[i][j];
3179                                 seqIndex[index] = aaI[i][j];
3180                                 
3181                                 index++;
3182                         }
3183                 }
3184         
3185         return 0; 
3186         }
3187         catch(exception& e) {
3188                 m->errorOut(e, "ShhherCommand", "fill");
3189                 exit(1);        
3190         }               
3191 }
3192 /**************************************************************************************************/
3193
3194 void ShhherCommand::setOTUs(int numOTUs, int numSeqs, vector<int>& seqNumber, vector<int>& seqIndex, vector<int>& cumNumSeqs, vector<int>& nSeqsPerOTU,
3195                             vector<int>& otuData, vector<double>& singleTau, vector<double>& dist, vector<vector<int> >& aaP, vector<vector<int> >& aaI){
3196         
3197         try {
3198                 vector<double> bigTauMatrix(numOTUs * numSeqs, 0.0000);
3199                 
3200                 for(int i=0;i<numOTUs;i++){
3201                         
3202                         if (m->control_pressed) { break; }
3203                         
3204                         for(int j=0;j<nSeqsPerOTU[i];j++){
3205                                 int index = cumNumSeqs[i] + j;
3206                                 double tauValue = singleTau[seqNumber[index]];
3207                                 int sIndex = seqIndex[index];
3208                                 bigTauMatrix[sIndex * numOTUs + i] = tauValue;                          
3209                         }
3210                 }
3211                 
3212                 for(int i=0;i<numSeqs;i++){
3213                         double maxTau = -1.0000;
3214                         int maxOTU = -1;
3215                         for(int j=0;j<numOTUs;j++){
3216                                 if(bigTauMatrix[i * numOTUs + j] > maxTau){
3217                                         maxTau = bigTauMatrix[i * numOTUs + j];
3218                                         maxOTU = j;
3219                                 }
3220                         }
3221                         
3222                         otuData[i] = maxOTU;
3223                 }
3224                 
3225                 nSeqsPerOTU.assign(numOTUs, 0);         
3226                 
3227                 for(int i=0;i<numSeqs;i++){
3228                         int index = otuData[i];
3229                         
3230                         singleTau[i] = 1.0000;
3231                         dist[i] = 0.0000;
3232                         
3233                         aaP[index][nSeqsPerOTU[index]] = i;
3234                         aaI[index][nSeqsPerOTU[index]] = i;
3235                         
3236                         nSeqsPerOTU[index]++;
3237                 }
3238         
3239                 fill(numOTUs, seqNumber, seqIndex, cumNumSeqs, nSeqsPerOTU, aaP, aaI);  
3240         }
3241         catch(exception& e) {
3242                 m->errorOut(e, "ShhherCommand", "setOTUs");
3243                 exit(1);        
3244         }               
3245 }
3246 /**************************************************************************************************/
3247
3248 void ShhherCommand::writeQualities(int numOTUs, int numFlowCells, string qualityFileName, vector<int> otuCounts, vector<int>& nSeqsPerOTU, vector<int>& seqNumber,
3249                                    vector<double>& singleTau, vector<short>& flowDataIntI, vector<short>& uniqueFlowgrams, vector<int>& cumNumSeqs,
3250                                    vector<int>& mapUniqueToSeq, vector<string>& seqNameVector, vector<int>& centroids, vector<vector<int> >& aaI){
3251         
3252         try {
3253         
3254                 ofstream qualityFile;
3255                 m->openOutputFile(qualityFileName, qualityFile);
3256         
3257                 qualityFile.setf(ios::fixed, ios::floatfield);
3258                 qualityFile.setf(ios::showpoint);
3259                 qualityFile << setprecision(6);
3260                 
3261                 vector<vector<int> > qualities(numOTUs);
3262                 vector<double> pr(HOMOPS, 0);
3263                 
3264                 
3265                 for(int i=0;i<numOTUs;i++){
3266                         
3267                         if (m->control_pressed) { break; }
3268                         
3269                         int index = 0;
3270                         int base = 0;
3271                         
3272                         if(nSeqsPerOTU[i] > 0){
3273                                 qualities[i].assign(1024, -1);
3274                                 
3275                                 while(index < numFlowCells){
3276                                         double maxPrValue = 1e8;
3277                                         short maxPrIndex = -1;
3278                                         double count = 0.0000;
3279                                         
3280                                         pr.assign(HOMOPS, 0);
3281                                         
3282                                         for(int j=0;j<nSeqsPerOTU[i];j++){
3283                                                 int lIndex = cumNumSeqs[i] + j;
3284                                                 double tauValue = singleTau[seqNumber[lIndex]];
3285                                                 int sequenceIndex = aaI[i][j];
3286                                                 short intensity = flowDataIntI[sequenceIndex * numFlowCells + index];
3287                                                 
3288                                                 count += tauValue;
3289                                                 
3290                                                 for(int s=0;s<HOMOPS;s++){
3291                                                         pr[s] += tauValue * singleLookUp[s * NUMBINS + intensity];
3292                                                 }
3293                                         }
3294                                         
3295                                         maxPrIndex = uniqueFlowgrams[centroids[i] * numFlowCells + index];
3296                                         maxPrValue = pr[maxPrIndex];
3297                                         
3298                                         if(count > MIN_COUNT){
3299                                                 double U = 0.0000;
3300                                                 double norm = 0.0000;
3301                                                 
3302                                                 for(int s=0;s<HOMOPS;s++){
3303                                                         norm += exp(-(pr[s] - maxPrValue));
3304                                                 }
3305                                                 
3306                                                 for(int s=1;s<=maxPrIndex;s++){
3307                                                         int value = 0;
3308                                                         double temp = 0.0000;
3309                                                         
3310                                                         U += exp(-(pr[s-1]-maxPrValue))/norm;
3311                                                         
3312                                                         if(U>0.00){
3313                                                                 temp = log10(U);
3314                                                         }
3315                                                         else{
3316                                                                 temp = -10.1;
3317                                                         }
3318                                                         temp = floor(-10 * temp);
3319                                                         value = (int)floor(temp);
3320                                                         if(value > 100){        value = 100;    }
3321                                                         
3322                                                         qualities[i][base] = (int)value;
3323                                                         base++;
3324                                                 }
3325                                         }
3326                                         
3327                                         index++;
3328                                 }
3329                         }
3330                         
3331                         
3332                         if(otuCounts[i] > 0){
3333                                 qualityFile << '>' << seqNameVector[mapUniqueToSeq[i]] << endl;
3334                         
3335                                 int j=4;        //need to get past the first four bases
3336                                 while(qualities[i][j] != -1){
3337                     qualityFile << qualities[i][j] << ' ';
3338                     if (j > qualities[i].size()) { break; }
3339                     j++;
3340                                 }
3341                                 qualityFile << endl;
3342                         }
3343                 }
3344                 qualityFile.close();
3345                 outputNames.push_back(qualityFileName); outputTypes["qfile"].push_back(qualityFileName);
3346         
3347         }
3348         catch(exception& e) {
3349                 m->errorOut(e, "ShhherCommand", "writeQualities");
3350                 exit(1);        
3351         }               
3352 }
3353
3354 /**************************************************************************************************/
3355
3356 void ShhherCommand::writeSequences(string thisCompositeFASTAFileName, int numOTUs, int numFlowCells, string fastaFileName, vector<int> otuCounts, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& centroids){
3357         try {
3358                 
3359                 ofstream fastaFile;
3360                 m->openOutputFile(fastaFileName, fastaFile);
3361                 
3362                 vector<string> names(numOTUs, "");
3363                 
3364                 for(int i=0;i<numOTUs;i++){
3365                         
3366                         if (m->control_pressed) { break; }
3367                         
3368                         int index = centroids[i];
3369                         
3370                         if(otuCounts[i] > 0){
3371                                 fastaFile << '>' << seqNameVector[aaI[i][0]] << endl;
3372                                 
3373                                 string newSeq = "";
3374                                 
3375                                 for(int j=0;j<numFlowCells;j++){
3376                                         
3377                                         char base = flowOrder[j % flowOrder.length()];
3378                                         for(int k=0;k<uniqueFlowgrams[index * numFlowCells + j];k++){
3379                                                 newSeq += base;
3380                                         }
3381                                 }
3382                                 
3383                                 if (newSeq.length() >= 4) {  fastaFile << newSeq.substr(4) << endl;  }
3384                 else {  fastaFile << "NNNN" << endl;  }
3385                         }
3386                 }
3387                 fastaFile.close();
3388         
3389                 outputNames.push_back(fastaFileName); outputTypes["fasta"].push_back(fastaFileName);
3390         
3391                 if(thisCompositeFASTAFileName != ""){
3392                         m->appendFiles(fastaFileName, thisCompositeFASTAFileName);
3393                 }
3394         }
3395         catch(exception& e) {
3396                 m->errorOut(e, "ShhherCommand", "writeSequences");
3397                 exit(1);        
3398         }               
3399 }
3400
3401 /**************************************************************************************************/
3402
3403 void ShhherCommand::writeNames(string thisCompositeNamesFileName, int numOTUs, string nameFileName, vector<int> otuCounts, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU){
3404         try {
3405                 
3406                 ofstream nameFile;
3407                 m->openOutputFile(nameFileName, nameFile);
3408                 
3409                 for(int i=0;i<numOTUs;i++){
3410                         
3411                         if (m->control_pressed) { break; }
3412                         
3413                         if(otuCounts[i] > 0){
3414                                 nameFile << seqNameVector[aaI[i][0]] << '\t' << seqNameVector[aaI[i][0]];
3415                                 
3416                                 for(int j=1;j<nSeqsPerOTU[i];j++){
3417                                         nameFile << ',' << seqNameVector[aaI[i][j]];
3418                                 }
3419                                 
3420                                 nameFile << endl;
3421                         }
3422                 }
3423                 nameFile.close();
3424                 outputNames.push_back(nameFileName); outputTypes["name"].push_back(nameFileName);
3425                 
3426                 
3427                 if(thisCompositeNamesFileName != ""){
3428                         m->appendFiles(nameFileName, thisCompositeNamesFileName);
3429                 }               
3430         }
3431         catch(exception& e) {
3432                 m->errorOut(e, "ShhherCommand", "writeNames");
3433                 exit(1);        
3434         }               
3435 }
3436
3437 /**************************************************************************************************/
3438
3439 void ShhherCommand::writeGroups(string groupFileName, string fileRoot, int numSeqs, vector<string>& seqNameVector){
3440         try {
3441         ofstream groupFile;
3442                 m->openOutputFile(groupFileName, groupFile);
3443                 
3444                 for(int i=0;i<numSeqs;i++){
3445                         if (m->control_pressed) { break; }
3446                         groupFile << seqNameVector[i] << '\t' << fileRoot << endl;
3447                 }
3448                 groupFile.close();
3449                 outputNames.push_back(groupFileName); outputTypes["group"].push_back(groupFileName);
3450         
3451         }
3452         catch(exception& e) {
3453                 m->errorOut(e, "ShhherCommand", "writeGroups");
3454                 exit(1);        
3455         }               
3456 }
3457
3458 /**************************************************************************************************/
3459
3460 void ShhherCommand::writeClusters(string otuCountsFileName, int numOTUs, int numFlowCells, vector<int> otuCounts, vector<int>& centroids, vector<short>& uniqueFlowgrams, vector<string>& seqNameVector, vector<vector<int> >& aaI, vector<int>& nSeqsPerOTU, vector<int>& lengths, vector<short>& flowDataIntI){
3461         try {
3462                 ofstream otuCountsFile;
3463                 m->openOutputFile(otuCountsFileName, otuCountsFile);
3464                 
3465                 string bases = flowOrder;
3466                 
3467                 for(int i=0;i<numOTUs;i++){
3468                         
3469                         if (m->control_pressed) {
3470                                 break;
3471                         }
3472                         //output the translated version of the centroid sequence for the otu
3473                         if(otuCounts[i] > 0){
3474                                 int index = centroids[i];
3475                                 
3476                                 otuCountsFile << "ideal\t";
3477                                 for(int j=8;j<numFlowCells;j++){
3478                                         char base = bases[j % bases.length()];
3479                                         for(int s=0;s<uniqueFlowgrams[index * numFlowCells + j];s++){
3480                                                 otuCountsFile << base;
3481                                         }
3482                                 }
3483                                 otuCountsFile << endl;
3484                                 
3485                                 for(int j=0;j<nSeqsPerOTU[i];j++){
3486                                         int sequence = aaI[i][j];
3487                                         otuCountsFile << seqNameVector[sequence] << '\t';
3488                                         
3489                                         string newSeq = "";
3490                                         
3491                                         for(int k=0;k<lengths[sequence];k++){
3492                                                 char base = bases[k % bases.length()];
3493                                                 int freq = int(0.01 * (double)flowDataIntI[sequence * numFlowCells + k] + 0.5);
3494                         
3495                                                 for(int s=0;s<freq;s++){
3496                                                         newSeq += base;
3497                                                         //otuCountsFile << base;
3498                                                 }
3499                                         }
3500                                         
3501                     if (newSeq.length() >= 4) {  otuCountsFile << newSeq.substr(4) << endl;  }
3502                     else {  otuCountsFile << "NNNN" << endl;  }
3503                                 }
3504                                 otuCountsFile << endl;
3505                         }
3506                 }
3507                 otuCountsFile.close();
3508                 outputNames.push_back(otuCountsFileName); outputTypes["counts"].push_back(otuCountsFileName);
3509         
3510         }
3511         catch(exception& e) {
3512                 m->errorOut(e, "ShhherCommand", "writeClusters");
3513                 exit(1);        
3514         }               
3515 }
3516
3517 /**************************************************************************************************/
3518
3519 void ShhherCommand::getSingleLookUp(){
3520         try{
3521                 //      these are the -log probabilities that a signal corresponds to a particular homopolymer length
3522                 singleLookUp.assign(HOMOPS * NUMBINS, 0);
3523                 
3524                 int index = 0;
3525                 ifstream lookUpFile;
3526                 m->openInputFile(lookupFileName, lookUpFile);
3527                 
3528                 for(int i=0;i<HOMOPS;i++){
3529                         
3530                         if (m->control_pressed) { break; }
3531                         
3532                         float logFracFreq;
3533                         lookUpFile >> logFracFreq;
3534                         
3535                         for(int j=0;j<NUMBINS;j++)      {
3536                                 lookUpFile >> singleLookUp[index];
3537                                 index++;
3538                         }
3539                 }       
3540                 lookUpFile.close();
3541         }
3542         catch(exception& e) {
3543                 m->errorOut(e, "ShhherCommand", "getSingleLookUp");
3544                 exit(1);
3545         }
3546 }
3547
3548 /**************************************************************************************************/
3549
3550 void ShhherCommand::getJointLookUp(){
3551         try{
3552                 
3553                 //      the most likely joint probability (-log) that two intenities have the same polymer length
3554                 jointLookUp.resize(NUMBINS * NUMBINS, 0);
3555                 
3556                 for(int i=0;i<NUMBINS;i++){
3557                         
3558                         if (m->control_pressed) { break; }
3559                         
3560                         for(int j=0;j<NUMBINS;j++){             
3561                                 
3562                                 double minSum = 100000000;
3563                                 
3564                                 for(int k=0;k<HOMOPS;k++){
3565                                         double sum = singleLookUp[k * NUMBINS + i] + singleLookUp[k * NUMBINS + j];
3566                                         
3567                                         if(sum < minSum)        {       minSum = sum;           }
3568                                 }       
3569                                 jointLookUp[i * NUMBINS + j] = minSum;
3570                         }
3571                 }
3572         }
3573         catch(exception& e) {
3574                 m->errorOut(e, "ShhherCommand", "getJointLookUp");
3575                 exit(1);
3576         }
3577 }
3578
3579 /**************************************************************************************************/
3580
3581 double ShhherCommand::getProbIntensity(int intIntensity){                          
3582         try{
3583
3584                 double minNegLogProb = 100000000; 
3585
3586                 
3587                 for(int i=0;i<HOMOPS;i++){//loop signal strength
3588                         
3589                         if (m->control_pressed) { break; }
3590                         
3591                         float negLogProb = singleLookUp[i * NUMBINS + intIntensity];
3592                         if(negLogProb < minNegLogProb)  {       minNegLogProb = negLogProb; }
3593                 }
3594                 
3595                 return minNegLogProb;
3596         }
3597         catch(exception& e) {
3598                 m->errorOut(e, "ShhherCommand", "getProbIntensity");
3599                 exit(1);
3600         }
3601 }
3602
3603
3604
3605